diff --git a/ais/backend/common.go b/ais/backend/common.go
index c24ab4b2a1..4938fdf99e 100644
--- a/ais/backend/common.go
+++ b/ais/backend/common.go
@@ -44,7 +44,7 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
         stats.KindCounter,
         &stats.Extra{
             Help: "GET: total number of executed remote requests (cold GETs)",
-            StrName: "remote_get_n",
+            StrName: "remote_get_count",
             Labels: labels,
         },
     )
@@ -52,7 +52,7 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
         b.metrics[stats.GetLatencyTotal],
         stats.KindTotal,
         &stats.Extra{
-            Help: "GET: total cumulative time (nanoseconds) to execute cold GETs and store new content in-cluster",
+            Help: "GET: total cumulative time (nanoseconds) to execute cold GETs and store new object versions in-cluster",
             StrName: "remote_get_ns_total",
             Labels: labels,
         },
@@ -61,7 +61,8 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
         b.metrics[stats.GetE2ELatencyTotal],
         stats.KindTotal,
         &stats.Extra{
-            Help: "GET: total end-to-end time (nanoseconds) servicing remote requests; includes: cold GET, store in-cluster, transmit response",
+            Help: "GET: total end-to-end time (nanoseconds) servicing remote requests; " +
+                "includes: receiving request, executing cold-GET, storing new object version in-cluster, and transmitting response",
             StrName: "remote_e2e_get_ns_total",
             Labels: labels,
         },
@@ -69,7 +70,10 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
     tr.RegExtMetric(snode,
         b.metrics[stats.GetSize],
         stats.KindSize,
-        &stats.Extra{Help: "GET: total cumulative size (bytes) of all cold GET operations", StrName: "remote_get_bytes_total", Labels: labels},
+        &stats.Extra{
+            Help: "GET: total cumulative size (bytes) of all cold-GET transactions",
+            StrName: "remote_get_bytes_total",
+            Labels: labels},
     )
 
     // PUT
@@ -81,13 +85,17 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
     tr.RegExtMetric(snode,
         b.metrics[stats.PutCount],
         stats.KindCounter,
-        &stats.Extra{Help: "PUT: total number of executed remote requests", StrName: "remote_put_n", Labels: labels},
+        &stats.Extra{
+            Help: "PUT: total number of executed remote requests to a given backend",
+            StrName: "remote_put_count",
+            Labels: labels,
+        },
     )
     tr.RegExtMetric(snode,
         b.metrics[stats.PutLatencyTotal],
         stats.KindTotal,
         &stats.Extra{
-            Help: "PUT: total cumulative time (nanoseconds) to execute remote requests and store new content in-cluster",
+            Help: "PUT: total cumulative time (nanoseconds) to execute remote requests and store new object versions in-cluster",
             StrName: "remote_put_ns_total",
             Labels: labels,
         },
@@ -97,13 +105,18 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
         stats.KindTotal,
         &stats.Extra{
             StrName: "remote_e2e_put_ns_total",
-            Help: "PUT: total end-to-end time (nanoseconds) servicing remote requests; includes: store in-cluster, execute remote PUT",
-            Labels: labels},
+            Help: "PUT: total end-to-end time (nanoseconds) servicing remote requests; " +
+                "includes: receiving PUT payload, storing it in-cluster, executing remote PUT, finalizing new in-cluster object",
+            Labels: labels},
     )
     tr.RegExtMetric(snode,
         b.metrics[stats.PutSize],
         stats.KindSize,
-        &stats.Extra{Help: "PUT: total cumulative size (bytes) of all PUTs to remote backend", StrName: "remote_e2e_put_bytes_total", Labels: labels},
+        &stats.Extra{
+            Help: "PUT: total cumulative size (bytes) of all PUTs to a given remote backend",
+            StrName: "remote_e2e_put_bytes_total",
+            Labels: labels,
+        },
     )
 
     // version changed out-of-band
@@ -115,7 +128,7 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
         stats.KindCounter,
         &stats.Extra{
             Help: "number of out-of-band updates (by a 3rd party performing remote PUTs outside this cluster)",
-            StrName: "remote_ver_change_n",
+            StrName: "remote_ver_change_count",
             Labels: labels,
         },
     )
diff --git a/stats/common.go b/stats/common.go
index 5ab1d84ae7..2a8a57d82c 100644
--- a/stats/common.go
+++ b/stats/common.go
@@ -7,7 +7,6 @@ package stats
 
 import (
     "encoding/json"
-    "fmt"
     rfs "io/fs"
     "os"
     "path/filepath"
@@ -92,6 +91,7 @@ const (
     ErrPutMirrorCount = errPrefix + "put.mirror.n"
 
     // KindLatency
+    // latency stats have numSamples used to compute average latency
     GetLatency         = "get.ns"
     GetLatencyTotal    = "get.ns.total"
     GetE2ELatencyTotal = "e2e.get.ns.total" // // e2e cold-GET latency
@@ -117,19 +117,6 @@ type (
 
 // primitives: values and maps
 type (
-    // Stats are tracked via a map of stats names (key) to statsValue (values).
-    // There are two main types of stats: counter and latency declared
-    // using the the kind field. Only latency stats have numSamples used to compute latency.
-    statsValue struct {
-        kind  string // enum { KindCounter, ..., KindSpecial }
-        label struct {
-            comm string // common part of the metric label (as in: . comm . )
-            stpr string // StatsD _or_ Prometheus label (depending on build tag)
-        }
-        Value      int64 `json:"v,string"`
-        numSamples int64 // (log + StatsD) only
-        cumulative int64
-    }
     copyValue struct {
         Value int64 `json:"v,string"`
     }
@@ -174,76 +161,76 @@ func (r *runner) regCommon(snode *meta.Snode) {
     // basic counters
     r.reg(snode, GetCount, KindCounter,
         &Extra{
-            Help: "number of executed GET(object) requests",
+            Help: "total number of executed GET(object) requests",
         },
     )
     r.reg(snode, PutCount, KindCounter,
         &Extra{
-            Help: "number of executed PUT(object) requests",
+            Help: "total number of executed PUT(object) requests",
         },
     )
     r.reg(snode, AppendCount, KindCounter,
         &Extra{
-            Help: "number of executed APPEND(object) requests",
+            Help: "total number of executed APPEND(object) requests",
         },
     )
     r.reg(snode, DeleteCount, KindCounter,
         &Extra{
-            Help: "number of executed DELETE(object) requests",
+            Help: "total number of executed DELETE(object) requests",
         },
     )
     r.reg(snode, RenameCount, KindCounter,
         &Extra{
-            Help: "number of executed Rename(object) requests",
+            Help: "total number of executed rename(object) requests",
         },
     )
     r.reg(snode, ListCount, KindCounter,
         &Extra{
-            Help: "number of executed list-objects requests",
+            Help: "total number of executed list-objects requests",
        },
     )
 
     // basic error counters, respectively
     r.reg(snode, ErrGetCount, KindCounter,
         &Extra{
-            Help: "number of GET(object) errors",
+            Help: "total number of GET(object) errors",
         },
     )
     r.reg(snode, ErrPutCount, KindCounter,
         &Extra{
-            Help: "number of PUT(object) errors",
+            Help: "total number of PUT(object) errors",
         },
     )
     r.reg(snode, ErrAppendCount, KindCounter,
         &Extra{
-            Help: "number of APPEND(object) errors",
+            Help: "total number of APPEND(object) errors",
         },
     )
     r.reg(snode, ErrDeleteCount, KindCounter,
         &Extra{
-            Help: "number of DELETE(object) errors",
+            Help: "total number of DELETE(object) errors",
         },
     )
     r.reg(snode, ErrRenameCount, KindCounter,
         &Extra{
-            Help: "number of Rename(object) errors",
+            Help: "total number of rename(object) errors",
         },
     )
     r.reg(snode, ErrListCount, KindCounter,
         &Extra{
-            Help: "number of list-objects errors",
+            Help: "total number of list-objects errors",
         },
     )
 
     // even more error counters
     r.reg(snode, ErrHTTPWriteCount, KindCounter,
         &Extra{
-            Help: "number of HTTP write-response errors",
+            Help: "total number of HTTP write-response errors",
         },
     )
     r.reg(snode, ErrDownloadCount, KindCounter,
         &Extra{
-            Help: "number of download errors",
+            Help: "downloader: number of download errors",
         },
     )
     r.reg(snode, ErrPutMirrorCount, KindCounter,
@@ -255,31 +242,30 @@ func (r *runner) regCommon(snode *meta.Snode) {
     // basic latencies
     r.reg(snode, GetLatency, KindLatency,
         &Extra{
-            Help: "total cumulative time (nanoseconds) to execute GET requests " +
-                "(in the logs: average GET latency over the last stats-time interval)",
+            Help: "GET: average time (milliseconds) over the last periodic.stats_time interval",
         },
     )
     r.reg(snode, GetLatencyTotal, KindTotal,
         &Extra{
-            Help: "total cumulative time (nanoseconds) to execute GET requests",
+            Help: "GET: total cumulative time (nanoseconds)",
         },
     )
     r.reg(snode, ListLatency, KindLatency,
         &Extra{
-            Help: "total cumulative time (nanoseconds) to execute list-objects requests " +
-                "(in the logs: average list-objects latency over the last stats-time interval)",
+            Help: "list-objects: average time (milliseconds) over the last periodic.stats_time interval",
         },
     )
     r.reg(snode, KeepAliveLatency, KindLatency,
         &Extra{
-            Help: "in-cluster keep-alive latency (heartbeat latency, nanoseconds)",
+            Help: "in-cluster keep-alive (heartbeat): average time (milliseconds) over the last periodic.stats_time interval",
         },
     )
 
     // special uptime
     r.reg(snode, Uptime, KindSpecial,
         &Extra{
-            Help: "this node's uptime since startup (nanoseconds)",
+            Help: "this node's uptime since its startup (seconds)",
+            StrName: "uptime",
         },
     )
 
@@ -292,57 +278,6 @@ func (r *runner) regCommon(snode *meta.Snode) {
     )
 }
 
-// naming convention: ".n" for the count and ".ns" for duration (nanoseconds)
-// compare with coreStats.initProm()
-func (r *runner) reg(snode *meta.Snode, name, kind string, extra *Extra) {
-    v := &statsValue{kind: kind}
-    f := func(units string) string {
-        return fmt.Sprintf("%s.%s.%s.%s", "ais"+snode.Type(), snode.ID(), v.label.comm, units)
-    }
-    switch kind {
-    case KindCounter:
-        debug.Assert(strings.HasSuffix(name, ".n"), name) // naming convention
-        v.label.comm = strings.TrimSuffix(name, ".n")
-        v.label.comm = strings.ReplaceAll(v.label.comm, ":", "_")
-        v.label.stpr = f("count")
-    case KindTotal:
-        debug.Assert(strings.HasSuffix(name, ".total"), name) // naming convention
-        v.label.comm = strings.TrimSuffix(name, ".total")
-        v.label.comm = strings.ReplaceAll(v.label.comm, ":", "_")
-        v.label.stpr = f("total")
-    case KindSize:
-        debug.Assert(strings.HasSuffix(name, ".size"), name) // naming convention
-        v.label.comm = strings.TrimSuffix(name, ".size")
-        v.label.comm = strings.ReplaceAll(v.label.comm, ":", "_")
-        v.label.stpr = f("mbytes")
-    case KindLatency:
-        debug.Assert(strings.Contains(name, ".ns"), name) // ditto
-        v.label.comm = strings.TrimSuffix(name, ".ns")
-        v.label.comm = strings.ReplaceAll(v.label.comm, ".ns.", ".")
-        v.label.comm = strings.ReplaceAll(v.label.comm, ":", "_")
-        v.label.stpr = f("ms")
-    case KindThroughput, KindComputedThroughput:
-        debug.Assert(strings.HasSuffix(name, ".bps"), name) // ditto
-        v.label.comm = strings.TrimSuffix(name, ".bps")
-        v.label.comm = strings.ReplaceAll(v.label.comm, ":", "_")
-        v.label.stpr = f("mbps")
-    default:
-        debug.Assert(kind == KindGauge || kind == KindSpecial)
-        v.label.comm = name
-        v.label.comm = strings.ReplaceAll(v.label.comm, ":", "_")
-        if name == Uptime {
-            v.label.comm = strings.ReplaceAll(v.label.comm, ".ns.", ".")
-            v.label.stpr = f("seconds")
-        } else {
-            v.label.stpr = fmt.Sprintf("%s.%s.%s", "ais"+snode.Type(), snode.ID(), v.label.comm)
-        }
-    }
-    r.core.Tracker[name] = v
-
-    // no-op for StatsD
-    r.regProm(snode, name, extra, v)
-}
-
 //
 // as cos.StatsUpdater
 //
diff --git a/stats/common_prom.go b/stats/common_prom.go
index 47ac497643..23067d0964 100644
--- a/stats/common_prom.go
+++ b/stats/common_prom.go
@@ -26,6 +26,14 @@ import (
 type (
     promDesc map[string]*prometheus.Desc
 
+    // Stats are tracked via a map of stats names (key) and statsValue (values).
+    statsValue struct {
+        kind       string // enum { KindCounter, ..., KindSpecial }
+        Value      int64  `json:"v,string"`
+        numSamples int64  // (average latency over stats_time)
+        cumulative int64  // REST API
+    }
+
     coreStats struct {
         Tracker  map[string]*statsValue
         promDesc promDesc
@@ -101,7 +109,7 @@ func (s *coreStats) update(nv cos.NamedVal64) {
     }
 }
 
-// log + StatsD (Prometheus is done separately via `Collect`)
+// usage: log resulting `copyValue` numbers:
 func (s *coreStats) copyT(out copyTracker, diskLowUtil ...int64) bool {
     idle := true
     intl := max(int64(s.statsTime.Seconds()), 1)
@@ -111,7 +119,7 @@ func (s *coreStats) copyT(out copyTracker, diskLowUtil ...int64) bool {
         case KindLatency:
             var lat int64
             if num := ratomic.SwapInt64(&v.numSamples, 0); num > 0 {
-                lat = ratomic.SwapInt64(&v.Value, 0) / num
+                lat = ratomic.SwapInt64(&v.Value, 0) / num // NOTE: log average latency (nanoseconds) over the last "periodic.stats_time" interval
                 if !ignore(name) {
                     idle = false
                 }
@@ -120,7 +128,7 @@ func (s *coreStats) copyT(out copyTracker, diskLowUtil ...int64) bool {
         case KindThroughput:
             var throughput int64
             if throughput = ratomic.SwapInt64(&v.Value, 0); throughput > 0 {
-                throughput /= intl
+                throughput /= intl // NOTE: log average throughput (bps) over the last "periodic.stats_time" interval
                 if !ignore(name) {
                     idle = false
                 }
@@ -158,6 +166,8 @@ func (s *coreStats) copyT(out copyTracker, diskLowUtil ...int64) bool {
 }
 
 // REST API what=stats query
+// NOTE: reporting total cumulative values to compute throughput and latency by the client
+// based on their respective time interval and request counts
 // NOTE: not reporting zero counts
 func (s *coreStats) copyCumulative(ctracker copyTracker) {
     for name, v := range s.Tracker {
@@ -218,65 +228,45 @@ var (
     _ prometheus.Collector = (*runner)(nil)
 )
 
-// TODO -- FIXME: remove
-func _foobar(id, name string, v *statsValue) (metricName, help string) {
-    label := name
-    label = strings.ReplaceAll(label, ".", "_")
-    // prometheus metrics names shouldn't include daemonID.
-    label = strings.ReplaceAll(label, "_"+id+"_", "_")
-    v.label.stpr = strings.ReplaceAll(label, ":", "_")
-
-    help = v.kind
-    if strings.HasSuffix(v.label.stpr, "_n") {
-        help = "total number of operations"
-    } else if strings.HasSuffix(v.label.stpr, "_size") {
-        help = "total size (bytes)"
-    } else if strings.HasSuffix(v.label.stpr, "avg_rsize") {
-        help = "average read size (bytes)"
-    } else if strings.HasSuffix(v.label.stpr, "avg_wsize") {
-        help = "average write size (bytes)"
-    } else if strings.HasSuffix(v.label.stpr, "_ns") {
-        v.label.stpr = strings.TrimSuffix(v.label.stpr, "_ns") + "_ms"
-        help = "latency (milliseconds)"
-    } else if strings.HasSuffix(v.label.stpr, "_ns_total") {
-        help = "cumulative latency (nanoseconds)"
-    } else if strings.Contains(v.label.stpr, "_ns_") {
-        v.label.stpr = strings.ReplaceAll(v.label.stpr, "_ns_", "_ms_")
-        if name == Uptime {
-            v.label.stpr = strings.ReplaceAll(v.label.stpr, "_ns_", "")
-            help = "uptime (seconds)"
-        } else {
-            help = "latency (milliseconds)"
-        }
-    } else if strings.HasSuffix(v.label.stpr, "_bps") {
-        v.label.stpr = strings.TrimSuffix(v.label.stpr, "_bps") + "_mbps"
-        help = "throughput (MB/s)"
-    }
-    return v.label.stpr, help
-}
+func (r *runner) reg(snode *meta.Snode, name, kind string, extra *Extra) {
+    v := &statsValue{kind: kind}
+    r.core.Tracker[name] = v
 
-func (r *runner) regProm(snode *meta.Snode, name string, extra *Extra, v *statsValue) {
     var (
         metricName  string
         help        string
         constLabels = dfltLabels
     )
-    if extra != nil {
-        if len(extra.Labels) > 0 {
-            constLabels = prometheus.Labels(extra.Labels)
-            constLabels["node_id"] = dfltLabels["node_id"]
-        }
-        if extra.StrName == "" {
-            metricName = strings.ReplaceAll(name, ".", "_")
-        } else {
-            metricName = extra.StrName
+    debug.Assert(extra != nil)
+    debug.Assert(extra.Help != "")
+    if len(extra.Labels) > 0 {
+        constLabels = prometheus.Labels(extra.Labels)
+        constLabels["node_id"] = dfltLabels["node_id"]
+    }
+    if extra.StrName == "" {
+        // when not explicitly specified: generate prometheus name
+        // from an internal name (compare with common_statsd reg() impl.)
+        switch kind {
+        case KindCounter:
+            debug.Assert(strings.HasSuffix(name, ".n"), name)
+            metricName = strings.TrimSuffix(name, ".n") + "_count"
+        case KindSize:
+            debug.Assert(strings.HasSuffix(name, ".size"), name)
+            metricName = strings.TrimSuffix(name, ".size") + "_bytes"
+        case KindLatency:
+            debug.Assert(strings.HasSuffix(name, ".ns"), name)
+            metricName = strings.TrimSuffix(name, ".ns") + "_ms"
+        case KindThroughput, KindComputedThroughput:
+            debug.Assert(strings.HasSuffix(name, ".bps"), name)
+            metricName = strings.TrimSuffix(name, ".bps") + "_mbps"
+        default:
+            metricName = name
         }
-        help = extra.Help
+        metricName = strings.ReplaceAll(metricName, ".", "_")
     } else {
-        // TODO -- FIXME: remove
-        id := strings.ReplaceAll(snode.ID(), ".", "_")
-        metricName, help = _foobar(id, name, v)
+        metricName = extra.StrName
     }
+    help = extra.Help
 
     fullqn := prometheus.BuildFQName("ais" /*namespace*/, snode.Type() /*subsystem*/, metricName)
     r.core.promDesc[name] = prometheus.NewDesc(fullqn, help, nil /*variableLabels*/, constLabels)
diff --git a/stats/common_statsd.go b/stats/common_statsd.go
index 3473cfafb6..7a2a2074a9 100644
--- a/stats/common_statsd.go
+++ b/stats/common_statsd.go
@@ -9,6 +9,7 @@ package stats
 
 import (
     "encoding/json"
+    "fmt"
    "os"
     "strings"
     ratomic "sync/atomic"
@@ -27,6 +28,18 @@ import (
 type (
     metric = statsd.Metric // type alias
 
+    // Stats are tracked via a map of stats names (key) and statsValue (values).
+    statsValue struct {
+        kind  string // enum { KindCounter, ..., KindSpecial }
+        label struct {
+            comm string // common part of the metric label (as in: . comm . )
+            stpr string // StatsD _or_ Prometheus label (depending on build tag)
+        }
+        Value      int64 `json:"v,string"`
+        numSamples int64 // (average latency over stats_time)
+        cumulative int64 // REST API
+    }
+
     coreStats struct {
         Tracker map[string]*statsValue
         statsdC *statsd.Client
@@ -136,7 +149,7 @@ func (s *coreStats) update(nv cos.NamedVal64) {
     }
 }
 
-// log + StatsD (Prometheus is done separately via `Collect`)
+// usage: log and StatsD Tx
 func (s *coreStats) copyT(out copyTracker, diskLowUtil ...int64) bool {
     idle := true
     intl := max(int64(s.statsTime.Seconds()), 1)
@@ -146,13 +159,14 @@ func (s *coreStats) copyT(out copyTracker, diskLowUtil ...int64) bool {
         case KindLatency:
             var lat int64
             if num := ratomic.SwapInt64(&v.numSamples, 0); num > 0 {
-                lat = ratomic.SwapInt64(&v.Value, 0) / num
+                lat = ratomic.SwapInt64(&v.Value, 0) / num // NOTE: log average latency (nanoseconds) over the last "periodic.stats_time" interval
                 if !ignore(name) {
                     idle = false
                 }
             }
             out[name] = copyValue{lat}
-            // NOTE: ns => ms, and not reporting zeros
+
+            // NOTE: if not zero, report StatsD latency (milliseconds) over the last "periodic.stats_time" interval
             millis := cos.DivRound(lat, int64(time.Millisecond))
             if !s.statsdDisabled() && millis > 0 {
                 s.statsdC.AppMetric(metric{Type: statsd.Timer, Name: v.label.stpr, Value: float64(millis)}, s.sgl)
@@ -160,7 +174,7 @@ func (s *coreStats) copyT(out copyTracker, diskLowUtil ...int64) bool {
         case KindThroughput:
             var throughput int64
             if throughput = ratomic.SwapInt64(&v.Value, 0); throughput > 0 {
-                throughput /= intl
+                throughput /= intl // NOTE: log average throughput (bps) over the last "periodic.stats_time" interval
                 if !ignore(name) {
                     idle = false
                 }
@@ -280,8 +294,48 @@ func (s *coreStats) reset(errorsOnly bool) {
 
 // runner //
 ////////////
 
-// empty stub
-func (*runner) regProm(*meta.Snode, string, *Extra, *statsValue) {}
+// naming convention: ".n" for the count and ".ns" for duration (nanoseconds)
+// compare with coreStats.initProm()
+func (r *runner) reg(snode *meta.Snode, name, kind string, _ *Extra) {
+    v := &statsValue{kind: kind}
+    f := func(units string) string {
+        return fmt.Sprintf("%s.%s.%s.%s", "ais"+snode.Type(), snode.ID(), v.label.comm, units)
+    }
+    debug.Assert(!strings.Contains(name, ":"), name)
+    switch kind {
+    case KindCounter:
+        debug.Assert(strings.HasSuffix(name, ".n"), name)
+        v.label.comm = strings.TrimSuffix(name, ".n")
+        v.label.stpr = f("count")
+    case KindTotal:
+        debug.Assert(strings.HasSuffix(name, ".total"), name)
+        v.label.comm = strings.TrimSuffix(name, ".total")
+        v.label.stpr = f("total")
+    case KindSize:
+        debug.Assert(strings.HasSuffix(name, ".size"), name)
+        v.label.comm = strings.TrimSuffix(name, ".size")
+        v.label.stpr = f("bytes")
+    case KindLatency:
+        debug.Assert(strings.HasSuffix(name, ".ns"), name)
+        v.label.comm = strings.TrimSuffix(name, ".ns")
+        v.label.comm = strings.ReplaceAll(v.label.comm, ":", "_")
+        v.label.stpr = f("ms")
+    case KindThroughput, KindComputedThroughput:
+        debug.Assert(strings.HasSuffix(name, ".bps"), name)
+        v.label.comm = strings.TrimSuffix(name, ".bps")
+        v.label.stpr = f("mbps")
+    default:
+        debug.Assert(kind == KindGauge || kind == KindSpecial)
+        v.label.comm = name
+        if name == Uptime {
+            v.label.comm = "uptime"
+            v.label.stpr = f("seconds")
+        } else {
+            v.label.stpr = fmt.Sprintf("%s.%s.%s", "ais"+snode.Type(), snode.ID(), v.label.comm)
+        }
+    }
+    r.core.Tracker[name] = v
+}
 
 func (*runner) IsPrometheus() bool { return false }
diff --git a/stats/target_stats.go b/stats/target_stats.go
index d4a4995216..835e413806 100644
--- a/stats/target_stats.go
+++ b/stats/target_stats.go
@@ -235,60 +235,210 @@ func isDiskUtilMetric(name string) bool {
 
 // target-specific metrics, in addition to common and already added via regCommon()
 func (r *Trunner) RegMetrics(snode *meta.Snode) {
-    r.reg(snode, LruEvictCount, KindCounter, nil)
-    r.reg(snode, LruEvictSize, KindSize, nil)
+    r.reg(snode, LruEvictCount, KindCounter,
+        &Extra{
+            Help: "number of LRU evictions",
+        },
+    )
+    r.reg(snode, LruEvictSize, KindSize,
+        &Extra{
+            Help: "total cumulative size (bytes) of LRU evictions",
+        },
+    )
 
-    r.reg(snode, CleanupStoreCount, KindCounter, nil)
-    r.reg(snode, CleanupStoreSize, KindSize, nil)
+    // removing $deleted objects is currently not counted
+    r.reg(snode, CleanupStoreCount, KindCounter,
+        &Extra{
+            Help: "space cleanup: number of removed misplaced objects and old work files",
+        },
+    )
+    r.reg(snode, CleanupStoreSize, KindSize,
+        &Extra{
+            Help: "space cleanup: total size (bytes) of all removed misplaced objects and old work files (not including removed deleted objects)",
+        },
+    )
 
-    r.reg(snode, VerChangeCount, KindCounter, nil)
-    r.reg(snode, VerChangeSize, KindSize, nil)
+    // out-of-band (x 3)
+    r.reg(snode, VerChangeCount, KindCounter,
+        &Extra{
+            Help: "number of out-of-band updates (by a 3rd party performing remote PUTs from outside this cluster)",
+        },
+    )
+    r.reg(snode, VerChangeSize, KindSize,
+        &Extra{
+            Help: "total cumulative size (bytes) of objects that were updated out-of-band across all backends combined",
+        },
+    )
+    r.reg(snode, RemoteDeletedDelCount, KindCounter,
+        &Extra{
+            Help: "number of out-of-band deletes (by a 3rd party remote DELETE(object) from outside this cluster)",
+        },
+    )
 
-    r.reg(snode, PutLatency, KindLatency, nil)
-    r.reg(snode, PutLatencyTotal, KindTotal, nil)
-    r.reg(snode, AppendLatency, KindLatency, nil)
-    r.reg(snode, GetRedirLatency, KindLatency, nil)
-    r.reg(snode, PutRedirLatency, KindLatency, nil)
+    r.reg(snode, PutLatency, KindLatency,
+        &Extra{
+            Help: "PUT: average time (milliseconds) over the last periodic.stats_time interval",
+        },
+    )
+    r.reg(snode, PutLatencyTotal, KindTotal,
+        &Extra{
+            Help: "PUT: total cumulative time (nanoseconds)",
+        },
+    )
+    r.reg(snode, AppendLatency, KindLatency,
+        &Extra{
+            Help: "APPEND(object): average time (milliseconds) over the last periodic.stats_time interval",
+        },
+    )
+    r.reg(snode, GetRedirLatency, KindLatency,
+        &Extra{
+            Help: "GET: average gateway-to-target HTTP redirect latency (milliseconds) over the last periodic.stats_time interval",
+        },
+    )
+    r.reg(snode, PutRedirLatency, KindLatency,
+        &Extra{
+            Help: "PUT: average gateway-to-target HTTP redirect latency (milliseconds) over the last periodic.stats_time interval",
+        },
+    )
 
     // bps
-    r.reg(snode, GetThroughput, KindThroughput, nil)
-    r.reg(snode, PutThroughput, KindThroughput, nil)
+    r.reg(snode, GetThroughput, KindThroughput,
+        &Extra{
+            Help: "GET: average throughput (MB/s) over the last periodic.stats_time interval",
+        },
+    )
+    r.reg(snode, PutThroughput, KindThroughput,
+        &Extra{
+            Help: "PUT: average throughput (MB/s) over the last periodic.stats_time interval",
+        },
+    )
 
-    r.reg(snode, GetSize, KindSize, nil)
-    r.reg(snode, PutSize, KindSize, nil)
+    r.reg(snode, GetSize, KindSize,
+        &Extra{
+            Help: "GET: total cumulative size (bytes)",
+        },
+    )
+    r.reg(snode, PutSize, KindSize,
+        &Extra{
+            Help: "PUT: total cumulative size (bytes)",
+        },
+    )
 
     // errors
-    r.reg(snode, ErrCksumCount, KindCounter, nil)
-    r.reg(snode, ErrCksumSize, KindSize, nil)
-    r.reg(snode, ErrFSHCCount, KindCounter, nil)
+    r.reg(snode, ErrCksumCount, KindCounter,
+        &Extra{
+            Help: "number of checksum errors",
+        },
+    )
+    r.reg(snode, ErrCksumSize, KindSize,
+        &Extra{
+            Help: "total cumulative size (bytes) of objects with checksum errors",
+        },
+    )
+    r.reg(snode, ErrFSHCCount, KindCounter,
+        &Extra{
+            Help: "number of times filesystem health checker (FSHC) was triggered by an I/O error or errors",
+        },
+    )
 
-    r.reg(snode, IOErrGetCount, KindCounter, nil)
-    r.reg(snode, IOErrPutCount, KindCounter, nil)
-    r.reg(snode, IOErrDeleteCount, KindCounter, nil)
+    r.reg(snode, IOErrGetCount, KindCounter,
+        &Extra{
+            Help: "GET: number of I/O errors _not_ including remote backend and network errors",
+        },
+    )
+    r.reg(snode, IOErrPutCount, KindCounter,
+        &Extra{
+            Help: "PUT: number of I/O errors _not_ including remote backend and network errors",
+        },
+    )
+    r.reg(snode, IOErrDeleteCount, KindCounter,
+        &Extra{
+            Help: "DELETE(object): number of I/O errors _not_ including remote backend and network errors",
+        },
+    )
 
     // streams
-    r.reg(snode, cos.StreamsOutObjCount, KindCounter, nil)
-    r.reg(snode, cos.StreamsOutObjSize, KindSize, nil)
-    r.reg(snode, cos.StreamsInObjCount, KindCounter, nil)
-    r.reg(snode, cos.StreamsInObjSize, KindSize, nil)
+    r.reg(snode, cos.StreamsOutObjCount, KindCounter,
+        &Extra{
+            Help: "intra-cluster streaming communications: number of sent objects",
+        },
+    )
+    r.reg(snode, cos.StreamsOutObjSize, KindSize,
+        &Extra{
+            Help: "intra-cluster streaming communications: total cumulative size (bytes) of all transmitted objects",
+        },
+    )
+    r.reg(snode, cos.StreamsInObjCount, KindCounter,
+        &Extra{
+            Help: "intra-cluster streaming communications: number of received objects",
+        },
+    )
+    r.reg(snode, cos.StreamsInObjSize, KindSize,
+        &Extra{
+            Help: "intra-cluster streaming communications: total cumulative size (bytes) of all received objects",
+        },
+    )
 
     // download
-    r.reg(snode, DownloadSize, KindSize, nil)
-    r.reg(snode, DownloadLatency, KindLatency, nil)
+    r.reg(snode, DownloadSize, KindSize,
+        &Extra{
+            Help: "total downloaded size (bytes)",
+        },
+    )
+    r.reg(snode, DownloadLatency, KindLatency,
+        &Extra{
+            Help: "download: average time (milliseconds) over the last periodic.stats_time interval",
+        },
+    )
 
     // dsort
-    r.reg(snode, DsortCreationReqCount, KindCounter, nil)
-    r.reg(snode, DsortCreationRespCount, KindCounter, nil)
-    r.reg(snode, DsortCreationRespLatency, KindLatency, nil)
-    r.reg(snode, DsortExtractShardDskCnt, KindCounter, nil)
-    r.reg(snode, DsortExtractShardMemCnt, KindCounter, nil)
-    r.reg(snode, DsortExtractShardSize, KindSize, nil)
+    r.reg(snode, DsortCreationReqCount, KindCounter,
+        &Extra{
+            Help: "dsort: see https://github.com/NVIDIA/aistore/blob/main/docs/dsort.md#metrics",
+        },
+    )
+    r.reg(snode, DsortCreationRespCount, KindCounter,
+        &Extra{
+            Help: "dsort: see https://github.com/NVIDIA/aistore/blob/main/docs/dsort.md#metrics",
+        },
+    )
+    r.reg(snode, DsortCreationRespLatency, KindLatency,
+        &Extra{
+            Help: "dsort: see https://github.com/NVIDIA/aistore/blob/main/docs/dsort.md#metrics",
+        },
+    )
+    r.reg(snode, DsortExtractShardDskCnt, KindCounter,
+        &Extra{
+            Help: "dsort: see https://github.com/NVIDIA/aistore/blob/main/docs/dsort.md#metrics",
+        },
+    )
+    r.reg(snode, DsortExtractShardMemCnt, KindCounter,
+        &Extra{
+            Help: "dsort: see https://github.com/NVIDIA/aistore/blob/main/docs/dsort.md#metrics",
+        },
+    )
+    r.reg(snode, DsortExtractShardSize, KindSize,
+        &Extra{
+            Help: "dsort: see https://github.com/NVIDIA/aistore/blob/main/docs/dsort.md#metrics",
+        },
+    )
 
     // core
-    r.reg(snode, RemoteDeletedDelCount, KindCounter, nil)
-    r.reg(snode, LcacheCollisionCount, KindCounter, nil)
-    r.reg(snode, LcacheEvictedCount, KindCounter, nil)
-    r.reg(snode, LcacheFlushColdCount, KindCounter, nil)
+    r.reg(snode, LcacheCollisionCount, KindCounter,
+        &Extra{
+            Help: "number of LOM cache collisions (core, internal)",
+        },
+    )
+    r.reg(snode, LcacheEvictedCount, KindCounter,
+        &Extra{
+            Help: "number of LOM cache evictions (core, internal)",
+        },
+    )
+    r.reg(snode, LcacheFlushColdCount, KindCounter,
+        &Extra{
+            Help: "number of times a LOM from cache was written to stable storage (core, internal)",
+        },
+    )
 }
 
 func (r *Trunner) RegDiskMetrics(snode *meta.Snode, disk string) {
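Note on the `copyCumulative` comment added above ("reporting total cumulative values to compute throughput and latency by the client"): with the counters renamed in this diff (`remote_get_count`, `remote_get_ns_total`, `remote_get_bytes_total`), a monitoring client can derive average cold-GET latency and throughput from two successive cumulative snapshots. The sketch below only illustrates that arithmetic; the `sample` type and `deriveRates` helper are hypothetical and not part of the AIStore API.

```go
package main

import (
	"fmt"
	"time"
)

// sample is a hypothetical client-side snapshot of the cumulative values the
// cluster reports (e.g., remote_get_count, remote_get_ns_total, remote_get_bytes_total).
type sample struct {
	taken time.Time
	count int64 // cumulative number of cold GETs
	ns    int64 // cumulative latency, nanoseconds
	bytes int64 // cumulative size, bytes
}

// deriveRates turns two cumulative snapshots into an average latency and an
// average throughput over the client's own scraping interval.
func deriveRates(prev, cur sample) (avgLat time.Duration, mibps float64) {
	if dn := cur.count - prev.count; dn > 0 {
		avgLat = time.Duration((cur.ns - prev.ns) / dn)
	}
	if secs := cur.taken.Sub(prev.taken).Seconds(); secs > 0 {
		mibps = float64(cur.bytes-prev.bytes) / secs / (1 << 20)
	}
	return avgLat, mibps
}

func main() {
	prev := sample{taken: time.Now().Add(-10 * time.Second), count: 100, ns: 2_000_000_000, bytes: 50 << 20}
	cur := sample{taken: time.Now(), count: 160, ns: 3_800_000_000, bytes: 110 << 20}
	lat, mibps := deriveRates(prev, cur)
	fmt.Printf("avg cold-GET latency: %v, throughput: %.1f MiB/s\n", lat, mibps)
}
```

This is the same division the node performs internally for KindLatency and KindThroughput in `copyT`, except that the client chooses its own interval and uses the cumulative totals instead of the per-interval swapped values.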