Skip to content

Commit

Permalink
(new) keep-alive error counter & keep-alive alert
Browse files Browse the repository at this point in the history
* add keep-alive error counter and alert, respectively:
  - "err.kalive.n"
  - "keep-alive-errors"
* keep-alive alert clears after 5 minutes (default) of no-change
* separately, bump minor versions:
  - aisloader, cli, authn

Signed-off-by: Alex Aizman <[email protected]>
  • Loading branch information
alex-aizman committed Sep 20, 2024
1 parent 9c961d5 commit 4e027cf
Show file tree
Hide file tree
Showing 8 changed files with 59 additions and 17 deletions.
13 changes: 9 additions & 4 deletions ais/kalive.go
Original file line number Diff line number Diff line change
Expand Up @@ -284,8 +284,10 @@ func (pkr *palive) updateSmap(config *cmn.Config) (stopped bool) {
continue
}
// otherwise, go keepalive with retries

pkr.statsT.IncErr(stats.ErrKaliveCount)
wg.Add(1)
go pkr.ping(si, wg, smap, config)
go pkr.goping(si, wg, smap, config)
}
}
wg.Wait()
Expand All @@ -308,7 +310,7 @@ func (pkr *palive) updateSmap(config *cmn.Config) (stopped bool) {
return
}

func (pkr *palive) ping(si *meta.Snode, wg cos.WG, smap *smapX, config *cmn.Config) {
func (pkr *palive) goping(si *meta.Snode, wg cos.WG, smap *smapX, config *cmn.Config) {
if len(pkr.stoppedCh) > 0 {
wg.Done()
return
Expand Down Expand Up @@ -338,6 +340,7 @@ func (pkr *palive) _pingRetry(si *meta.Snode, smap *smapX, config *cmn.Config) (

tout := config.Timeout.MaxKeepalive.String()
nlog.Warningln("failed to slow-ping", si.StringEx(), "- retrying [", err, status, tout, smap.StringEx(), "]")
pkr.statsT.IncErr(stats.ErrKaliveCount)

ticker := time.NewTicker(cmn.KeepaliveRetryDuration(config))
ok, stopped = pkr.retry(si, ticker, config.Timeout.MaxKeepalive.D())
Expand Down Expand Up @@ -432,6 +435,7 @@ func (pkr *palive) retry(si *meta.Snode, ticker *time.Ticker, timeout time.Durat
return true, false
}

pkr.statsT.IncErr(stats.ErrKaliveCount)
i++

if i >= kaNumRetries {
Expand Down Expand Up @@ -559,12 +563,11 @@ func (k *keepalive) do(smap *smapX, si *meta.Snode, config *cmn.Config) (stopped
pid = smap.Primary.ID()
timeout = config.Timeout.CplaneOperation.D()
started = mono.NanoTime()
fast bool
)
if nlog.Stopping() {
return
}
fast = k.k.cluUptime(started) > max(k.interval<<2, config.Timeout.Startup.D()>>1)
fast := k.k.cluUptime(started) > max(k.interval<<2, config.Timeout.Startup.D()>>1)
cpid, status, err := k.k.sendKalive(smap, timeout, started, fast)
if err == nil {
now := mono.NanoTime()
Expand All @@ -573,6 +576,8 @@ func (k *keepalive) do(smap *smapX, si *meta.Snode, config *cmn.Config) (stopped
return
}

k.statsT.IncErr(stats.ErrKaliveCount)

debug.Assert(cpid == pid && cpid != si.ID(), pid+", "+cpid+", "+si.ID())
if status != 0 {
nlog.Warningln(si.String(), "=>", meta.Pname(pid), "keepalive failed: [", err, status, "]")
Expand Down
4 changes: 4 additions & 0 deletions ais/metasync.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ import (
jsoniter "github.com/json-iterator/go"
)

//
// [TODO] retry to pub-addr, if different (currently, cmn.NetIntraControl only; see palive.retry)
//

// Metasync provides two methods to the rest of the `ais` code:
// * sync - to synchronize cluster-level metadata (the main method)
// * becomeNonPrimary - to be called when the current primary becomes non-primary
Expand Down
3 changes: 3 additions & 0 deletions cmd/cli/cli/performance.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ import (
"github.com/urfave/cli"
)

// TODO: target only - won't show put/get etc. counters and error counters (e.g. keep-alive) from proxies' perspective

type (
perfcb func(c *cli.Context,
metrics cos.StrKVs, mapBegin, mapEnd teb.StstMap, elapsed time.Duration) bool
Expand Down Expand Up @@ -352,6 +354,7 @@ func showPerfTab(c *cli.Context, metrics cos.StrKVs, cb perfcb, tag string, tota
}
}

// TODO: target only - won't show proxies' put/get etc. counters and error counters (e.g. keep-alive)
smap, tstatusMap, _, err := fillNodeStatusMap(c, apc.Target)
if err != nil {
return err
Expand Down
2 changes: 1 addition & 1 deletion cmd/cli/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module github.com/NVIDIA/aistore/cmd/cli
go 1.22.3

require (
github.com/NVIDIA/aistore v1.3.24-0.20240919133121-550cade20b27
github.com/NVIDIA/aistore v1.3.24-0.20240920182211-0795fcc51822
github.com/fatih/color v1.17.0
github.com/json-iterator/go v1.1.12
github.com/onsi/ginkgo/v2 v2.20.0
Expand Down
4 changes: 2 additions & 2 deletions cmd/cli/go.sum
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
code.cloudfoundry.org/bytefmt v0.0.0-20190710193110-1eb035ffe2b6/go.mod h1:wN/zk7mhREp/oviagqUXY3EwuHhWyOvAdsn5Y4CzOrc=
github.com/BurntSushi/toml v1.3.2/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ=
github.com/NVIDIA/aistore v1.3.24-0.20240919133121-550cade20b27 h1:Thit15k9E0M+BvkYaHzPl53h9xoLjSHI1YIY6Q98dbk=
github.com/NVIDIA/aistore v1.3.24-0.20240919133121-550cade20b27/go.mod h1:si83S9r29vwIC0f0CE2Mk+25bFiaN6mmVlmuBpP4hHM=
github.com/NVIDIA/aistore v1.3.24-0.20240920182211-0795fcc51822 h1:1APUMPecxcDtb2njGP+SJpbkkJQo1xDpCp4VW7WAfgY=
github.com/NVIDIA/aistore v1.3.24-0.20240920182211-0795fcc51822/go.mod h1:si83S9r29vwIC0f0CE2Mk+25bFiaN6mmVlmuBpP4hHM=
github.com/OneOfOne/xxhash v1.2.8 h1:31czK/TI9sNkxIKfaUfGlU47BAxQ0ztGgd9vPyqimf8=
github.com/OneOfOne/xxhash v1.2.8/go.mod h1:eZbhyaAYD41SGSSsnmcpxVoRiQ/MPUTjUdIIOT9Um7Q=
github.com/VividCortex/ewma v1.1.1/go.mod h1:2Tkkvm3sRDVXaiyucHiACn4cqf7DpdyLvmxzcbUokwA=
Expand Down
11 changes: 8 additions & 3 deletions cmn/cos/node_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,9 @@ package cos

import (
"fmt"
"os"

"github.com/NVIDIA/aistore/cmn/debug"
"github.com/NVIDIA/aistore/cmn/nlog"
)

type NodeStateFlags BitFlags
Expand All @@ -35,6 +35,7 @@ const (
CertWillSoonExpire // warning X.509
CertificateExpired // red --/--
CertificateInvalid // red --/--
KeepAliveErrors // warning (errors during the last ~1m)
)

func (f NodeStateFlags) IsOK() bool { return f == NodeStarted|ClusterStarted }
Expand Down Expand Up @@ -129,12 +130,16 @@ func (f NodeStateFlags) String() string {
if f&CertificateInvalid == CertificateInvalid {
sb = append(sb, "tls-cert-invalid")
}
if f&KeepAliveErrors == KeepAliveErrors {
sb = append(sb, "keep-alive-errors")
}

l := len(sb)
switch l {
case 0:
err := fmt.Errorf("unknown flag %b", int64(f))
nlog.Errorln(err)
v := int64(f)
err := fmt.Errorf("node state alerts: unknown flag %x (%b)", v, v)
fmt.Fprintln(os.Stderr, err)
debug.Assert(false, err)
return "-" // (teb.unknownVal)
case 1:
Expand Down
6 changes: 3 additions & 3 deletions cmn/ver_const.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ const GitHubHome = "https://github.com/NVIDIA/aistore"

const (
VersionAIStore = "3.24.rc4"
VersionCLI = "1.12"
VersionLoader = "1.11"
VersionAuthN = "1.0"
VersionCLI = "1.13"
VersionLoader = "1.12"
VersionAuthN = "1.1"
)

const (
Expand Down
33 changes: 29 additions & 4 deletions stats/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ import (

// defaults and tunables
const (
dfltKaliveClearAlert = 5 * time.Minute // clear `cos.KeepAliveErrors` alert when `ErrKaliveCount` doesn't inc that much time
dfltPeriodicFlushTime = time.Minute // when `config.Log.FlushTime` is 0 (zero)
dfltPeriodicTimeStamp = time.Hour // extended date/time complementary to log timestamps (e.g., "11:29:11.644596")
maxStatsLogInterval = int64(3 * time.Minute) // when idle; secondly, an upper limit on `config.Log.StatsTime`
Expand Down Expand Up @@ -85,6 +86,8 @@ const (
ErrRenameCount = errPrefix + RenameCount
ErrListCount = errPrefix + ListCount

ErrKaliveCount = errPrefix + "kalive.n"

// more errors
// (for even more errors, see target_stats)
ErrHTTPWriteCount = errPrefix + "http.write.n"
Expand Down Expand Up @@ -229,6 +232,11 @@ func (r *runner) regCommon(snode *meta.Snode) {
Help: "total number of list-objects errors",
},
)
r.reg(snode, ErrKaliveCount, KindCounter,
&Extra{
Help: "total number of keep-alive failures",
},
)

// even more error counters
r.reg(snode, ErrHTTPWriteCount, KindCounter,
Expand Down Expand Up @@ -418,36 +426,53 @@ waitStartup:

var (
lastNgr int64
lastKaliveErrInc int64
kaliveErrs int64
startTime = mono.NanoTime() // uptime henceforth
lastDateTimestamp = startTime
lastDateTimestamp = startTime // RFC822
)
for {
select {
case <-r.ticker.C:
now := mono.NanoTime()
config = cmn.GCO.Get()
logger.log(now, time.Duration(now-startTime) /*uptime*/, config)

// 1. "High number of"
lastNgr = r.checkNgr(now, lastNgr, goMaxProcs)

if statsTime != config.Periodic.StatsTime.D() {
statsTime = config.Periodic.StatsTime.D()
r.ticker.Reset(statsTime)
logger.statsTime(statsTime)
}
//
// NOTE: stats runner is solely responsible to flush logs
//

// 2. flush logs (NOTE: stats runner is solely responsible)
flushTime := dfltPeriodicFlushTime
if config.Log.FlushTime != 0 {
flushTime = config.Log.FlushTime.D()
}
if nlog.Since(now) > flushTime || nlog.OOB() {
nlog.Flush(nlog.ActNone)
}

// 3. dated time => info log
if time.Duration(now-lastDateTimestamp) > dfltPeriodicTimeStamp {
nlog.Infoln(cos.FormatTime(time.Now(), "" /* RFC822 */) + " =============")
lastDateTimestamp = now
}

// 4. kalive alert
n := r.Get(ErrKaliveCount)
if n != kaliveErrs {
// raise
lastKaliveErrInc = now
r.SetFlag(NodeAlerts, cos.KeepAliveErrors)
kaliveErrs = n
} else if time.Duration(now-lastKaliveErrInc) >= dfltKaliveClearAlert {
// clear
r.ClrFlag(NodeAlerts, cos.KeepAliveErrors)
}
case <-r.stopCh:
r.ticker.Stop()
return nil
Expand Down

0 comments on commit 4e027cf

Please sign in to comment.