diff --git a/ais/htcommon.go b/ais/htcommon.go
index b216875073..c3aa548264 100644
--- a/ais/htcommon.go
+++ b/ais/htcommon.go
@@ -726,7 +726,7 @@ func (smap *smapX) fill(nsti *cos.NodeStateInfo) {
 
 func (c *getMaxCii) do(si *meta.Snode, wg cos.WG, smap *smapX) {
 	var nsti *cos.NodeStateInfo
-	body, _, err := c.h.reqHealth(si, c.timeout, c.query, smap)
+	body, _, err := c.h.reqHealth(si, c.timeout, c.query, smap, false /*retry pub-addr*/)
 	if err != nil {
 		goto ret
 	}
diff --git a/ais/htrun.go b/ais/htrun.go
index dd87a9d92d..214fcc9bfd 100644
--- a/ais/htrun.go
+++ b/ais/htrun.go
@@ -1413,7 +1413,7 @@ func (h *htrun) isValidObjname(w http.ResponseWriter, r *http.Request, name stri
 }
 
 // health client
-func (h *htrun) reqHealth(si *meta.Snode, tout time.Duration, q url.Values, smap *smapX, retry ...int) ([]byte, int, error) {
+func (h *htrun) reqHealth(si *meta.Snode, tout time.Duration, q url.Values, smap *smapX, retry bool) ([]byte, int, error) {
 	var (
 		path = apc.URLPathHealth.S
 		url  = si.URL(cmn.NetIntraControl)
@@ -1428,10 +1428,10 @@ func (h *htrun) reqHealth(si *meta.Snode, tout time.Duration, q url.Values, smap
 	b, status, err := res.bytes, res.status, res.err
 	freeCR(res)
 
-	if err != nil && len(retry) > 0 {
+	if err != nil && retry {
 		// [NOTE] retrying when:
-		// - about to remove the node from the cluster map, or
-		// - about to elect new primary
+		// - about to remove node 'si' from the cluster map, or
+		// - about to elect a new primary;
 		// not checking `IsErrDNSLookup` and similar - ie., not trying to narrow down
 		// (compare w/ slow-keepalive)
 		if si.PubNet.Hostname != si.ControlNet.Hostname {
@@ -2077,7 +2077,7 @@ func (h *htrun) pollClusterStarted(config *cmn.Config, psi *meta.Snode) (maxNsti
 			nlog.Warningln(h.String(), "started as a non-primary and got _elected_ during startup")
 			return
 		}
-		if _, _, err := h.reqHealth(smap.Primary, healthTimeout, query /*ask primary*/, smap); err == nil {
+		if _, _, err := h.reqHealth(smap.Primary, healthTimeout, query /*ask primary*/, smap, false /*retry pub-addr*/); err == nil {
 			// log
 			s := fmt.Sprintf("%s via primary health: cluster startup Ok, %s", h.si, smap.StringEx())
 			if self := smap.GetNode(h.si.ID()); self == nil {
diff --git a/ais/kalive.go b/ais/kalive.go
index 4f0a99665d..48b6d2b4b5 100644
--- a/ais/kalive.go
+++ b/ais/kalive.go
@@ -233,7 +233,7 @@ func (pkr *palive) do(config *cmn.Config) (stopped bool) {
 	}
 	if smap.isPrimary(pkr.p.si) {
 		if !pkr.inProgress.CAS(false, true) {
-			nlog.Infoln(pkr.p.String() + ": primary keepalive in progress")
+			nlog.Warningln(pkr.p.String(), "primary keepalive in progress") // NOTE: see wg.Wait() below
 			return
 		}
 		stopped = pkr.updateSmap(config)
@@ -249,8 +249,8 @@ func (pkr *palive) do(config *cmn.Config) (stopped bool) {
 	return
 }
 
-// updateSmap pings all nodes in parallel. Non-responding nodes get removed from the Smap and
-// the resulting map is then metasync-ed.
+// keep-alive nodes in parallel; nodes that fail to respond get removed from the cluster map (Smap)
+// (see 'maintenance-mode' comment below)
 func (pkr *palive) updateSmap(config *cmn.Config) (stopped bool) {
 	var (
 		p    = pkr.p
@@ -277,7 +277,7 @@ func (pkr *palive) updateSmap(config *cmn.Config) (stopped bool) {
 
 		// direct call first
 		started := mono.NanoTime()
-		if _, _, err := pkr.p.reqHealth(si, config.Timeout.CplaneOperation.D(), nil, smap); err == nil {
+		if _, _, err := pkr.p.reqHealth(si, config.Timeout.CplaneOperation.D(), nil, smap, false /*retry pub-addr*/); err == nil {
 			now := mono.NanoTime()
 			pkr.statsT.Add(stats.KeepAliveLatency, now-started)
 			pkr.hb.HeardFrom(si.ID(), now) // effectively, yes
@@ -310,6 +310,7 @@ func (pkr *palive) updateSmap(config *cmn.Config) (stopped bool) {
 	return
 }
 
+// "slow-ping"
 func (pkr *palive) goping(si *meta.Snode, wg cos.WG, smap *smapX, config *cmn.Config) {
 	if len(pkr.stoppedCh) > 0 {
 		wg.Done()
@@ -327,10 +328,10 @@ func (pkr *palive) goping(si *meta.Snode, wg cos.WG, smap *smapX, config *cmn.Co
 
 func (pkr *palive) _pingRetry(si *meta.Snode, smap *smapX, config *cmn.Config) (ok, stopped bool) {
 	var (
-		timeout = config.Timeout.CplaneOperation.D()
+		tout    = config.Timeout.CplaneOperation.D()
 		started = mono.NanoTime()
 	)
-	_, status, err := pkr.p.reqHealth(si, timeout, nil, smap)
+	_, status, err := pkr.p.reqHealth(si, tout, nil, smap, true /*retry via pub-addr, if different*/)
 	if err == nil {
 		now := mono.NanoTime()
 		pkr.statsT.Add(stats.KeepAliveLatency, now-started)
@@ -338,12 +339,12 @@ func (pkr *palive) _pingRetry(si *meta.Snode, smap *smapX, config *cmn.Config) (
 		return true, false
 	}
 
-	tout := config.Timeout.MaxKeepalive.String()
+	tout = config.Timeout.MaxKeepalive.D()
 	nlog.Warningln("failed to slow-ping", si.StringEx(), "- retrying [", err, status, tout, smap.StringEx(), "]")
 	pkr.statsT.IncErr(stats.ErrKaliveCount)
 
 	ticker := time.NewTicker(cmn.KeepaliveRetryDuration(config))
-	ok, stopped = pkr.retry(si, ticker, config.Timeout.MaxKeepalive.D())
+	ok, stopped = pkr.retry(si, ticker, tout)
 	ticker.Stop()
 
 	return ok, stopped
@@ -411,7 +412,7 @@ func (pkr *palive) _final(ctx *smapModifier, clone *smapX) {
 	_ = pkr.p.metasyncer.sync(revsPair{clone, msg})
 }
 
-func (pkr *palive) retry(si *meta.Snode, ticker *time.Ticker, timeout time.Duration) (ok, stopped bool) {
+func (pkr *palive) retry(si *meta.Snode, ticker *time.Ticker, tout time.Duration) (ok, stopped bool) {
 	var i int
 	for {
 		if !pkr.timeToPing(si.ID()) {
@@ -426,8 +427,7 @@ func (pkr *palive) retry(si *meta.Snode, ticker *time.Ticker, timeout time.Durat
 			started = mono.NanoTime()
 			smap    = pkr.p.owner.smap.get()
 		)
-		// retry via pub-addr if different (compare with slowKalive)
-		_, status, err := pkr.p.reqHealth(si, timeout, nil, smap, 1 /*retry*/)
+		_, status, err := pkr.p.reqHealth(si, tout, nil, smap, true /*retry via pub-addr, if different*/)
 		if err == nil {
 			now := mono.NanoTime()
 			pkr.statsT.Add(stats.KeepAliveLatency, now-started)
@@ -561,14 +561,16 @@ func (k *keepalive) configUpdate(cfg *cmn.KeepaliveTrackerConf) {
 func (k *keepalive) do(smap *smapX, si *meta.Snode, config *cmn.Config) (stopped bool) {
 	var (
 		pid     = smap.Primary.ID()
-		timeout = config.Timeout.CplaneOperation.D()
+		pname   = meta.Pname(pid)
+		tout    = config.Timeout.CplaneOperation.D()
 		started = mono.NanoTime()
+		sname   = si.String()
 	)
 	if nlog.Stopping() {
 		return
 	}
 	fast := k.k.cluUptime(started) > max(k.interval<<2, config.Timeout.Startup.D()>>1)
-	cpid, status, err := k.k.sendKalive(smap, timeout, started, fast)
+	cpid, status, err := k.k.sendKalive(smap, tout, started, fast)
 	if err == nil {
 		now := mono.NanoTime()
 		k.statsT.Add(stats.KeepAliveLatency, now-started)
@@ -580,9 +582,9 @@ func (k *keepalive) do(smap *smapX, si *meta.Snode, config *cmn.Config) (stopped
 	debug.Assert(cpid == pid && cpid != si.ID(), pid+", "+cpid+", "+si.ID())
 
 	if status != 0 {
-		nlog.Warningln(si.String(), "=>", meta.Pname(pid), "keepalive failed: [", err, status, "]")
+		nlog.Warningln(sname, "=>", pname, "keepalive failed: [", err, status, "]")
 	} else {
-		nlog.Warningln(si.String(), "=>", meta.Pname(pid), "keepalive failed:", err)
+		nlog.Warningln(sname, "=>", pname, "keepalive failed:", err)
 	}
 
 	//
@@ -600,22 +602,27 @@ func (k *keepalive) do(smap *smapX, si *meta.Snode, config *cmn.Config) (stopped
 			// and therefore not skipping keepalive req (compare with palive.retry)
 			i++
 			started := mono.NanoTime()
-			pid, status, err = k.k.sendKalive(nil, timeout, started, false /*fast*/)
+			pid, status, err = k.k.sendKalive(nil, tout, started, false /*fast*/)
 			if pid == si.ID() {
 				return // elected as primary
 			}
+			pname = meta.Pname(pid)
 			if err == nil {
 				now := mono.NanoTime()
 				k.statsT.Add(stats.KeepAliveLatency, now-started)
				k.hb.HeardFrom(pid, now) // effectively, yes
-				nlog.Infof("%s: OK after %d attempt%s", si, i, cos.Plural(i))
+				if i == 1 {
+					nlog.Infoln(sname, "=>", pname, "OK after 1 attempt")
+				} else {
+					nlog.Infoln(sname, "=>", pname, "OK after", i, "attempts")
+				}
 				return
 			}
 
-			// repeat up to `kaNumRetries` with the max timeout
-			timeout = config.Timeout.MaxKeepalive.D()
+			// repeat up to `kaNumRetries` times with max-keepalive timeout
+			tout = config.Timeout.MaxKeepalive.D()
 			if i == kaNumRetries {
-				nlog.Warningf("%s: failed %d attempts => %s (primary)", si, i, meta.Pname(pid))
+				nlog.Warningln(sname, "=>", pname, "failed after", i, "attempts")
 				return true
 			}
 			if cos.IsUnreachable(err, status) {
@@ -624,7 +631,7 @@ func (k *keepalive) do(smap *smapX, si *meta.Snode, config *cmn.Config) (stopped
 			if nlog.Stopping() {
 				return true
 			}
-			err = fmt.Errorf("%s: unexpected response from %s: %v(%d)", si, meta.Pname(pid), err, status)
+			err = fmt.Errorf("%s: unexpected response from %s: %v(%d)", sname, pname, err, status)
 			debug.AssertNoErr(err)
 			nlog.Warningln(err)
 		case sig := <-k.controlCh:
diff --git a/ais/prxclu.go b/ais/prxclu.go
index b2098102e2..d16467744e 100644
--- a/ais/prxclu.go
+++ b/ais/prxclu.go
@@ -1597,6 +1597,7 @@ func (p *proxy) _rebPostRm(ctx *smapModifier, clone *smapX) {
 }
 
 func (p *proxy) stopMaintenance(w http.ResponseWriter, r *http.Request, msg *apc.ActMsg) {
+	const tag = "stop-maintenance:"
 	var (
 		opts apc.ActValRmNode
 		smap = p.owner.smap.get()
@@ -1612,31 +1613,41 @@ func (p *proxy) stopMaintenance(w http.ResponseWriter, r *http.Request, msg *apc
 		return
 	}
 
-	nlog.Infof("%s: %s(%s) opts=%v", p, msg.Action, si.StringEx(), opts)
+	sname := si.StringEx()
+	pname := p.String()
+	nlog.Infoln(tag, pname, "[", msg.Action, sname, opts, "]")
 	if !smap.InMaint(si) {
 		p.writeErrf(w, r, "node %s is not in maintenance mode - nothing to do", si.StringEx())
 		return
 	}
 
-	timeout := cmn.GCO.Get().Timeout.CplaneOperation.D()
-	if _, status, err := p.reqHealth(si, timeout, nil, smap); err != nil {
-		sleep, retries := timeout/2, 5
-		for range retries { // retry
+	tout := cmn.Rom.CplaneOperation()
+	if _, status, err := p.reqHealth(si, tout, nil, smap, false /*retry pub-addr*/); err != nil {
+		// TODO -- FIXME: use cmn.KeepaliveRetryDuration()
+		sleep, retries := tout/2, 4
+		time.Sleep(sleep)
+		for i := range retries { // retry
 			time.Sleep(sleep)
-			_, status, err = p.reqHealth(si, timeout, nil, smap)
+			_, status, err = p.reqHealth(si, tout, nil, smap, true /*retry pub-addr*/)
 			if err == nil {
+				if i == 1 {
+					nlog.Infoln(tag, pname, "=>", sname, "OK after 1 attempt [", msg.Action, opts, "]")
+				} else {
+					nlog.Infoln(tag, pname, "=>", sname, "OK after", i, "attempts [", msg.Action, opts, "]")
+				}
 				break
 			}
 			if status != http.StatusServiceUnavailable {
-				p.writeErrf(w, r, "%s is unreachable: %v(%d)", si, err, status)
+				p.writeErrf(w, r, "%s is unreachable: %v(%d)", sname, err, status)
 				return
 			}
+			sleep = min(sleep+time.Second, tout)
 		}
 
 		if err != nil {
 			debug.Assert(status == http.StatusServiceUnavailable)
 			nlog.Errorf("%s: node %s takes unusually long time to start: %v(%d) - proceeding anyway",
-				p.si, si, err, status)
+				pname, sname, err, status)
 		}
 	}
 
diff --git a/ais/tgtimpl.go b/ais/tgtimpl.go
index 85ef252a0b..4e384cf6d3 100644
--- a/ais/tgtimpl.go
+++ b/ais/tgtimpl.go
@@ -34,7 +34,7 @@ func (*target) GetAllRunning(inout *core.AllRunningInOut, periodic bool) {
 }
 
 func (t *target) Health(si *meta.Snode, timeout time.Duration, query url.Values) ([]byte, int, error) {
-	return t.reqHealth(si, timeout, query, t.owner.smap.get())
+	return t.reqHealth(si, timeout, query, t.owner.smap.get(), false /*retry*/)
 }
 
 func (t *target) Backend(bck *meta.Bck) core.Backend {
diff --git a/ais/vote.go b/ais/vote.go
index f08f74a370..f0522958f8 100644
--- a/ais/vote.go
+++ b/ais/vote.go
@@ -228,8 +228,7 @@ func (p *proxy) elect(vr *VoteRecord, xele *xs.Election) {
 		curName = curPrimary.StringEx()
 		pname   = p.String()
 		pnameC  = pname + ":"
-		config  = cmn.GCO.Get()
-		timeout = config.Timeout.CplaneOperation.D() / 2
+		tout    = cmn.Rom.CplaneOperation()
 	)
 	// 1. ping the current primary (not using `apc.QparamAskPrimary` as the latter might be transitioning)
 	for i := range retryCurPrimary {
@@ -243,19 +242,16 @@ func (p *proxy) elect(vr *VoteRecord, xele *xs.Election) {
 			return
 		}
 
-		// retry via pub-addr if different (compare with palive.retry)
-		_, _, err = p.reqHealth(curPrimary, timeout, nil /*ask primary*/, smap, 1 /*retry*/)
+		_, _, err = p.reqHealth(curPrimary, tout, nil /*ask primary*/, smap, true /*retry via pub-addr, if different*/)
 		if err == nil {
 			break
 		}
-		timeout = config.Timeout.CplaneOperation.D()
 	}
 
 	if err == nil {
 		// move back to idle
 		query := url.Values{apc.QparamAskPrimary: []string{"true"}}
-		// retry via pub-addr if different (compare with palive.retry)
-		_, _, err = p.reqHealth(curPrimary, timeout, query /*ask primary*/, smap, 1 /*retry*/)
+		_, _, err = p.reqHealth(curPrimary, tout, query /*ask primary*/, smap, true /* ditto */)
 		if err == nil {
 			nlog.Infoln(pnameC, "the current primary", curName, "is up, moving back to idle")
 
diff --git a/cmd/cli/cli/log_hdlr.go b/cmd/cli/cli/log_hdlr.go
index b4e33f8c50..3148a9fe59 100644
--- a/cmd/cli/cli/log_hdlr.go
+++ b/cmd/cli/cli/log_hdlr.go
@@ -25,14 +25,15 @@ import (
 const clusterCompletion = "cluster"
 
 // (compare with getCluLogUsage)
-const getLogUsage = "download the current log or entire log history from a selected node or all nodes, e.g.:\n" +
-	indent4 + "\t - 'ais log get NODE_ID /tmp' - download the specified node's current log; save the result to the specified directory;\n" +
-	indent4 + "\t - 'ais log get NODE_ID /tmp/out --refresh 10' - download the current log as /tmp/out\n" +
-	indent4 + "\t keep updating (ie., appending) the latter every 10s;\n" +
-	indent4 + "\t - 'ais log get cluster /tmp' - download TAR.GZ archived logs from _all_ nodes in the cluster\n" +
-	indent4 + "\t (note that 'cluster' implies '--all'), and save the result to the specified destination;\n" +
-	indent4 + "\t - 'ais log get NODE_ID --all' - download the node's TAR.GZ log archive\n" +
-	indent4 + "\t - 'ais log get NODE_ID --all --severity e' - TAR.GZ archive of (only) logged errors and warnings"
+const getLogUsage = "download the current log or entire log history from: a) selected node, or b) entire cluster,\n" +
+	indent1 + "\t e.g.:\n" +
+	indent1 + "\t - 'ais log get NODE_ID /tmp'\t- download the specified node's current log and save it in the specified directory;\n" +
+	indent1 + "\t - 'ais log get NODE_ID /tmp/out --refresh 10'\t- download the node's current log _as_ /tmp/out\n" +
+	indent1 + "\t \t and keep updating (ie., appending) the latter every 10s;\n" +
+	indent1 + "\t - 'ais log get cluster /tmp'\t-\tdownload TAR.GZ archived logs of _all_ nodes in the cluster\n" +
+	indent1 + "\t \t and save the result in the specified local directory (note that 'get cluster' implies '--all');\n" +
+	indent1 + "\t - 'ais log get NODE_ID --all'\t- given 'NODE-ID' download the node's entire log TAR.GZ archive\n" +
+	indent1 + "\t - 'ais log get NODE_ID --all --severity e'\t- archive logged errors and warnings"
 
 var (
 	nodeLogFlags = map[string][]cli.Flag{
diff --git a/cmn/config.go b/cmn/config.go
index baa42a7f37..db643a2e17 100644
--- a/cmn/config.go
+++ b/cmn/config.go
@@ -1216,7 +1216,7 @@ func (c *KeepaliveConf) Validate() (err error) {
 
 func KeepaliveRetryDuration(c *Config) time.Duration {
 	d := c.Timeout.CplaneOperation.D() * time.Duration(c.Keepalive.RetryFactor)
-	return min(d, c.Timeout.MaxKeepalive.D()+time.Second/2)
+	return min(d, c.Timeout.MaxKeepalive.D()+time.Second)
 }
 
 /////////////