Skip to content

Commit

Permalink
core: retry _slow_ keepalive upon DNS lookup failure, given
* different control and pub hostnames
Browse files (browse the repository at this point in the history)

Signed-off-by: Alex Aizman <[email protected]>
  • Loading branch information
alex-aizman committed Sep 19, 2024
1 parent eb369d4 commit a8a5c1e
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 4 deletions.
17 changes: 17 additions & 0 deletions ais/htrun.go
Original file line number Diff line number Diff line change
Expand Up @@ -1993,6 +1993,23 @@ func (h *htrun) slowKalive(smap *smapX, htext htext, timeout time.Duration) (pid
if strings.Contains(res.err.Error(), ciePrefix) {
cos.ExitLog(res.err) // FATAL: cluster integrity error (cie)
}
//
// intermittent DNS failure? retry just once, via the primary's public URL, if the error is a confirmed DNS lookup error && pub hostname != control hostname
//
if cos.IsErrDNSLookup(res.err) && primaryURL == smap.Primary.URL(cmn.NetIntraControl) {
debug.Assert(psi == smap.Primary)
if smap.Primary.PubNet.Hostname != smap.Primary.ControlNet.Hostname {
nlog.Warningln(h.si.String(), "=>", psi.StringEx(), "slow keepalive:", err)
primaryURL = smap.Primary.URL(cmn.NetPublic)
nlog.Warningln("retrying via pub addr:", primaryURL)

freeCR(res)
res = h.regTo(primaryURL, psi, timeout, nil, htext, true /*keepalive*/)
}
} else if s := res.err.Error(); strings.Contains(s, "lookup") || strings.Contains(s, "no such host") {
// DEBUG -- remove when tested -- DEBUG
nlog.Infof(">>> slow keepalive: %v (%T)", res.err, res.err)
}
status, err = res.status, res.err
}
freeCR(res)
Expand Down
8 changes: 6 additions & 2 deletions ais/kalive.go
Original file line number Diff line number Diff line change
Expand Up @@ -567,7 +567,11 @@ func (k *keepalive) do(smap *smapX, si *meta.Snode, config *cmn.Config) (stopped
}

debug.Assert(cpid == pid && cpid != si.ID(), pid+", "+cpid+", "+si.ID())
nlog.Warningf("%s => %s keepalive failed: %v(%d)", si, meta.Pname(pid), err, status)
if status != 0 {
nlog.Warningln(si.String(), "=>", meta.Pname(pid), "keepalive failed: [", err, status, "]")
} else {
nlog.Warningln(si.String(), "=>", meta.Pname(pid), "keepalive failed:", err)
}

//
// retry
Expand All @@ -584,7 +588,7 @@ func (k *keepalive) do(smap *smapX, si *meta.Snode, config *cmn.Config) (stopped
// and therefore not skipping keepalive req (compare with palive.retry)
i++
started := mono.NanoTime()
pid, status, err = k.k.sendKalive(nil, timeout, started, false)
pid, status, err = k.k.sendKalive(nil, timeout, started, false /*fast*/)
if pid == si.ID() {
return // elected as primary
}
Expand Down
4 changes: 2 additions & 2 deletions cmn/cos/err.go
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ func IsErrOOS(err error) bool {
return errors.Is(err, syscall.ENOSPC)
}

func isErrDNSLookup(err error) bool {
func IsErrDNSLookup(err error) bool {
if _, ok := err.(*net.DNSError); ok {
return ok
}
Expand All @@ -178,7 +178,7 @@ func isErrDNSLookup(err error) bool {

func IsUnreachable(err error, status int) bool {
return IsErrConnectionRefused(err) ||
isErrDNSLookup(err) ||
IsErrDNSLookup(err) ||
errors.Is(err, context.DeadlineExceeded) ||
status == http.StatusRequestTimeout ||
status == http.StatusServiceUnavailable ||
Expand Down

0 comments on commit a8a5c1e

Please sign in to comment.