diff --git a/ais/htrun.go b/ais/htrun.go index 1bf72a77cc..159d403dc4 100644 --- a/ais/htrun.go +++ b/ais/htrun.go @@ -1993,6 +1993,23 @@ func (h *htrun) slowKalive(smap *smapX, htext htext, timeout time.Duration) (pid if strings.Contains(res.err.Error(), ciePrefix) { cos.ExitLog(res.err) // FATAL: cluster integrity error (cie) } + // + // intermittent DNS? retry just once if confirmed && pub != control + // + if cos.IsErrDNSLookup(res.err) && primaryURL == smap.Primary.URL(cmn.NetIntraControl) { + debug.Assert(psi == smap.Primary) + if smap.Primary.PubNet.Hostname != smap.Primary.ControlNet.Hostname { + nlog.Warningln(h.si.String(), "=>", psi.StringEx(), "slow keepalive:", res.err) + primaryURL = smap.Primary.URL(cmn.NetPublic) + nlog.Warningln("retrying via pub addr:", primaryURL) + + freeCR(res) + res = h.regTo(primaryURL, psi, timeout, nil, htext, true /*keepalive*/) + } + } else if s := res.err.Error(); strings.Contains(s, "lookup") || strings.Contains(s, "no such host") { + // DEBUG -- remove when tested -- DEBUG + nlog.Infof(">>> slow keepalive: %v (%T)", res.err, res.err) + } status, err = res.status, res.err } freeCR(res) diff --git a/ais/kalive.go b/ais/kalive.go index 950d3c1268..22bac9e2b0 100644 --- a/ais/kalive.go +++ b/ais/kalive.go @@ -567,7 +567,11 @@ func (k *keepalive) do(smap *smapX, si *meta.Snode, config *cmn.Config) (stopped } debug.Assert(cpid == pid && cpid != si.ID(), pid+", "+cpid+", "+si.ID()) - nlog.Warningf("%s => %s keepalive failed: %v(%d)", si, meta.Pname(pid), err, status) + if status != 0 { + nlog.Warningln(si.String(), "=>", meta.Pname(pid), "keepalive failed: [", err, status, "]") + } else { + nlog.Warningln(si.String(), "=>", meta.Pname(pid), "keepalive failed:", err) + } // // retry @@ -584,7 +588,7 @@ func (k *keepalive) do(smap *smapX, si *meta.Snode, config *cmn.Config) (stopped // and therefore not skipping keepalive req (compare with palive.retry) i++ started := mono.NanoTime() - pid, status, err = k.k.sendKalive(nil, 
timeout, started, false) + pid, status, err = k.k.sendKalive(nil, timeout, started, false /*fast*/) if pid == si.ID() { return // elected as primary } diff --git a/cmn/cos/err.go b/cmn/cos/err.go index d5932801a2..d1fdc835bf 100644 --- a/cmn/cos/err.go +++ b/cmn/cos/err.go @@ -168,7 +168,7 @@ func IsErrOOS(err error) bool { return errors.Is(err, syscall.ENOSPC) } -func isErrDNSLookup(err error) bool { +func IsErrDNSLookup(err error) bool { if _, ok := err.(*net.DNSError); ok { return ok } @@ -178,7 +178,7 @@ func isErrDNSLookup(err error) bool { func IsUnreachable(err error, status int) bool { return IsErrConnectionRefused(err) || - isErrDNSLookup(err) || + IsErrDNSLookup(err) || errors.Is(err, context.DeadlineExceeded) || status == http.StatusRequestTimeout || status == http.StatusServiceUnavailable ||