Skip to content

Commit

Permalink
tls cert (re)loader: raise/clear alerts; follow-up
Browse files Browse the repository at this point in the history
* part six, prev. commit: 7c338a2

Signed-off-by: Alex Aizman <[email protected]>
  • Loading branch information
alex-aizman committed Aug 30, 2024
1 parent fbb2062 commit 1fe3c80
Show file tree
Hide file tree
Showing 12 changed files with 62 additions and 30 deletions.
10 changes: 5 additions & 5 deletions ais/htrun.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ func (h *htrun) ByteMM() *memsys.MMSA { return h.smm }
// NOTE: currently, only 'resume' (see also: kaSuspendMsg)
func (h *htrun) smapUpdatedCB(_, _ *smapX, nfl, ofl cos.BitFlags) {
if ofl.IsAnySet(meta.SnodeMaintDecomm) && !nfl.IsAnySet(meta.SnodeMaintDecomm) {
h.statsT.ClrFlag(stats.NodeStateFlags, cos.MaintenanceMode)
h.statsT.ClrFlag(stats.NodeAlerts, cos.MaintenanceMode)
h.keepalive.ctrl(kaResumeMsg)
}
}
Expand Down Expand Up @@ -194,14 +194,14 @@ func (h *htrun) ClusterStarted() bool { return h.startup.cluster.Load() > 0 } //

func (h *htrun) markClusterStarted() {
h.startup.cluster.Store(mono.NanoTime())
h.statsT.SetFlag(stats.NodeStateFlags, cos.ClusterStarted)
h.statsT.SetFlag(stats.NodeAlerts, cos.ClusterStarted)
}

func (h *htrun) NodeStarted() bool { return h.startup.node.Load() > 0 }

func (h *htrun) markNodeStarted() {
h.startup.node.Store(mono.NanoTime())
h.statsT.SetFlag(stats.NodeStateFlags, cos.NodeStarted)
h.statsT.SetFlag(stats.NodeAlerts, cos.NodeStarted)
}

func (h *htrun) regNetHandlers(networkHandlers []networkHandler) {
Expand Down Expand Up @@ -261,7 +261,7 @@ func (h *htrun) regNetHandlers(networkHandlers []networkHandler) {
func (h *htrun) init(config *cmn.Config) {
// before newTLS() below & before intra-cluster clients
if config.Net.HTTP.UseHTTPS {
if err := aistls.Init(config.Net.HTTP.Certificate, config.Net.HTTP.CertKey); err != nil {
if err := aistls.Init(config.Net.HTTP.Certificate, config.Net.HTTP.CertKey, h.statsT); err != nil {
cos.ExitLog(err)
}
}
Expand Down Expand Up @@ -1144,7 +1144,7 @@ func (h *htrun) statsAndStatus() (ds *stats.NodeStatus) {
Snode: h.si,
},
Cluster: cos.NodeStateInfo{
Flags: cos.NodeStateFlags(h.statsT.Get(stats.NodeStateFlags)),
Flags: cos.NodeStateFlags(h.statsT.Get(stats.NodeAlerts)),
},
SmapVersion: smap.Version,
MemCPUInfo: apc.GetMemCPU(),
Expand Down
2 changes: 1 addition & 1 deletion ais/target.go
Original file line number Diff line number Diff line change
Expand Up @@ -507,7 +507,7 @@ func (t *target) checkRestarted(config *cmn.Config) (fatalErr, writeErr error) {
fatalErr = fmt.Errorf("%s: %q is in use (duplicate or overlapping run?)", t, red.inUse)
return
}
t.statsT.SetFlag(stats.NodeStateFlags, cos.Restarted)
t.statsT.SetFlag(stats.NodeAlerts, cos.Restarted)
fs.PersistMarker(fname.NodeRestartedPrev)
}
fatalErr, writeErr = fs.PersistMarker(fname.NodeRestartedMarker)
Expand Down
4 changes: 2 additions & 2 deletions ais/tgtcp.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,13 +188,13 @@ func (t *target) daeputMsg(w http.ResponseWriter, r *http.Request) {
if !t.ensureIntraControl(w, r, true /* from primary */) {
return
}
t.statsT.SetFlag(stats.NodeStateFlags, cos.MaintenanceMode)
t.statsT.SetFlag(stats.NodeAlerts, cos.MaintenanceMode)
t.termKaliveX(msg.Action, true)
case apc.ActShutdownCluster, apc.ActShutdownNode:
if !t.ensureIntraControl(w, r, true /* from primary */) {
return
}
t.statsT.SetFlag(stats.NodeStateFlags, cos.MaintenanceMode)
t.statsT.SetFlag(stats.NodeAlerts, cos.MaintenanceMode)
t.termKaliveX(msg.Action, false)
t.shutdown(msg.Action)
case apc.ActRmNodeUnsafe:
Expand Down
2 changes: 1 addition & 1 deletion ais/tgtfshc.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,6 @@ func (t *target) FSHC(err error, mi *fs.Mountpath, fqn string) {
func (t *target) DisableMpath(mi *fs.Mountpath) (err error) {
_, err = t.fsprg.disableMpath(mi.Path, true /*dont-resilver*/)

t.statsT.SetFlag(stats.NodeStateFlags, cos.DiskFault)
t.statsT.SetFlag(stats.NodeAlerts, cos.DiskFault)
return err
}
4 changes: 2 additions & 2 deletions ais/tgtspace.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,9 @@ func (t *target) OOS(csRefreshed *fs.CapStatus, config *cmn.Config, tcdf *fs.Tcd
}

if cs.IsOOS() {
t.statsT.SetFlag(stats.NodeStateFlags, cos.OOS)
t.statsT.SetFlag(stats.NodeAlerts, cos.OOS)
} else {
t.statsT.SetFlag(stats.NodeStateFlags, cos.LowCapacity)
t.statsT.SetFlag(stats.NodeAlerts, cos.LowCapacity)
}
nlog.Warningln(t.String(), "running store cleanup:", cs.String())
// run serially, cleanup first and LRU second, iff out-of-space persists
Expand Down
2 changes: 1 addition & 1 deletion cmd/cli/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module github.com/NVIDIA/aistore/cmd/cli
go 1.22.3

require (
github.com/NVIDIA/aistore v1.3.24-0.20240829150638-5e92eff58c06
github.com/NVIDIA/aistore v1.3.24-0.20240829200923-53d4e43a34af
github.com/fatih/color v1.17.0
github.com/json-iterator/go v1.1.12
github.com/onsi/ginkgo/v2 v2.20.0
Expand Down
4 changes: 2 additions & 2 deletions cmd/cli/go.sum
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
code.cloudfoundry.org/bytefmt v0.0.0-20190710193110-1eb035ffe2b6/go.mod h1:wN/zk7mhREp/oviagqUXY3EwuHhWyOvAdsn5Y4CzOrc=
github.com/BurntSushi/toml v1.3.2/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ=
github.com/NVIDIA/aistore v1.3.24-0.20240829150638-5e92eff58c06 h1:ElvM2C2BNSoOURslPeaQz2+mspTrny4IVATVIn6BQjI=
github.com/NVIDIA/aistore v1.3.24-0.20240829150638-5e92eff58c06/go.mod h1:si83S9r29vwIC0f0CE2Mk+25bFiaN6mmVlmuBpP4hHM=
github.com/NVIDIA/aistore v1.3.24-0.20240829200923-53d4e43a34af h1:V4U4kWxVMZ+I4Vaicr3ockHD/gymoAEuKmxG+Kw8TQw=
github.com/NVIDIA/aistore v1.3.24-0.20240829200923-53d4e43a34af/go.mod h1:si83S9r29vwIC0f0CE2Mk+25bFiaN6mmVlmuBpP4hHM=
github.com/OneOfOne/xxhash v1.2.8 h1:31czK/TI9sNkxIKfaUfGlU47BAxQ0ztGgd9vPyqimf8=
github.com/OneOfOne/xxhash v1.2.8/go.mod h1:eZbhyaAYD41SGSSsnmcpxVoRiQ/MPUTjUdIIOT9Um7Q=
github.com/VividCortex/ewma v1.1.1/go.mod h1:2Tkkvm3sRDVXaiyucHiACn4cqf7DpdyLvmxzcbUokwA=
Expand Down
2 changes: 2 additions & 0 deletions cmn/cos/node_state.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ import (

type NodeStateFlags BitFlags

const NodeAlerts = "state.flags"

const (
VoteInProgress = NodeStateFlags(1 << iota) // warning
ClusterStarted // info: (primary: cluster-started | all other nodes: joined-cluster)
Expand Down
17 changes: 12 additions & 5 deletions cmn/tls/certloader.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ import (
"github.com/NVIDIA/aistore/hk"
)

// TODO: the loaded certificate can be _expired_ while its replacement is invalid (non-parseable) - differentiate the two cases

const name = "certificate-loader"

type (
Expand All @@ -35,6 +37,7 @@ type (
xcert atomic.Pointer[xcert]
certFile string
keyFile string
tstats cos.StatsUpdater
}

GetCertCB func(_ *tls.ClientHelloInfo) (*tls.Certificate, error)
Expand All @@ -46,13 +49,13 @@ var (
)

// (htrun only)
func Init(certFile, keyFile string) (err error) {
func Init(certFile, keyFile string, tstats cos.StatsUpdater) (err error) {
if certFile == "" && keyFile == "" {
return nil
}

debug.Assert(loader == nil)
loader = &certLoader{certFile: certFile, keyFile: keyFile}
loader = &certLoader{certFile: certFile, keyFile: keyFile, tstats: tstats}
if err = loader.load(false /*compare*/); err != nil {
nlog.Errorln("FATAL:", err)
loader = nil
Expand Down Expand Up @@ -83,6 +86,7 @@ func (cl *certLoader) hktime() (d time.Duration) {
nlog.Errorln(cl.certFile, warn, rem)
d = min(10*time.Second, rem)
default: // expired
cl.tstats.SetFlag(cos.NodeAlerts, cos.CertificateExpired)
d = time.Hour
}
return d
Expand Down Expand Up @@ -147,6 +151,7 @@ func (cl *certLoader) load(compare bool) (err error) {
}

// 4. keep and log
cl.tstats.ClrFlag(cos.NodeAlerts, cos.CertificateExpired)
cl.xcert.Store(&xcert)
nlog.Infoln(xcert.String())

Expand Down Expand Up @@ -186,8 +191,10 @@ func (x *xcert) ini(finfo os.FileInfo) (err error) {
x.notAfter = x.Certificate.Leaf.NotAfter
}
now := time.Now()
if now.Before(x.notBefore) || now.After(x.notAfter) {
nlog.Errorln(x.parent.certFile, "X.509 is invalid - outside its certified time range [", x.notBefore, x.notAfter, "]")
if now.After(x.notAfter) {
err = fmt.Errorf("%s: X.509 %s expired (valid until %v)", name, x.parent.certFile, x.notAfter)
} else if now.Before(x.notBefore) {
nlog.Warningln(x.parent.certFile, "X.509 is not valid _yet_: [", x.notBefore, x.notAfter, "]")
}
return nil
return err
}
29 changes: 26 additions & 3 deletions docs/https.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,14 +137,37 @@ As far as automatic adjustment of the interval, this depends on the remaining ti
| more than 1h | 10m |
| more than 10m | 1m |
| 10m or less | 10s |
| `expired` | 1h |

Upon initial loading, or every time when reloading, AIS node logs a record that also shows the validity bounds, e.g.:
Upon initial loading, and again on every reload, an AIS node logs a record that also shows the certificate's validity bounds, e.g.:

```log
I 11:05:45.753438 certloader:151 server.crt[26 Aug 24 18:18 UTC, 26 Aug 25 18:18 UTC]
```

If and when the certificate expires, AIS node raises the namesake alert that (as usual) will show up via Grafana dashboard, CLI `show cluster` command, or both.
In addition, if the certificate expires, the AIS node raises the namesake alert that - as usual - will show up in the Grafana dashboard, the CLI `show cluster` command, or both.

```console
$ ais show cluster
PROXY MEM USED(%) MEM AVAIL LOAD AVERAGE UPTIME STATUS ALERT
p[atipJhgn][P] 0.17% 27.51GiB [0.3 0.1 0.0] - online **TLS-certificate-expired**

TARGET MEM USED(%) MEM AVAIL CAP USED(%) CAP AVAIL LOAD AVERAGE STATUS ALERT
t[NlLtPtrm] 0.16% 27.51GiB 16% 367.538GiB [0.3 0.1 0.0] online **TLS-certificate-expired**

Summary:
Proxies: 1
Targets: 1 (one disk)
Capacity: used 70.59GiB (16%), available 367.54GiB
Cluster Map: version 4, UUID A5yAiCsW7p, primary p[atipJhgn]
Software: 3.24.rc3.97255b97e (build: 2024-08-29T19:27:33-0400)
Deployment: dev
Status: 2 online
Rebalance: n/a
Authentication: disabled
Version: 3.24.rc3.97255b97e
Build: 2024-08-29T19:27:33-0400
```

Finally, to reload the TLS certificate at any given time, simply run:

Expand All @@ -164,7 +187,7 @@ $ ais advanced load-X.509 t[NlLtPtrm]
Done.
```

Note: if [AuthN](docs/authn.md) is deployed, the API (and CLI above) will require administrative permissions.
Note: if [AuthN](/docs/authn.md) is deployed, the API (and CLI above) will require administrative permissions.

## Switching cluster between HTTP and HTTPS

Expand Down
4 changes: 2 additions & 2 deletions reb/globrun.go
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ func (reb *Reb) RunRebalance(smap *meta.Smap, id int64, notif *xact.NotifXact, t

onGFN()

tstats.SetFlag(stats.NodeStateFlags, cos.Rebalancing)
tstats.SetFlag(stats.NodeAlerts, cos.Rebalancing)

errCnt := 0
err := reb.run(rargs)
Expand All @@ -240,7 +240,7 @@ func (reb *Reb) RunRebalance(smap *meta.Smap, id int64, notif *xact.NotifXact, t
}

reb.fini(rargs, logHdr, err)
tstats.ClrFlag(stats.NodeStateFlags, cos.Rebalancing)
tstats.ClrFlag(stats.NodeAlerts, cos.Rebalancing)

offGFN()
}
Expand Down
12 changes: 6 additions & 6 deletions stats/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ const (
Uptime = "up.ns.time"

// KindGauge, cos.NodeStateFlags enum
NodeStateFlags = "state.flags"
NodeAlerts = cos.NodeAlerts // "state.flags"
)

// interfaces
Expand Down Expand Up @@ -281,7 +281,7 @@ func (r *runner) regCommon(snode *meta.Snode) {
)

// snode state flags
r.reg(snode, NodeStateFlags, KindGauge,
r.reg(snode, NodeAlerts, KindGauge,
&Extra{
Help: "bitwise 64-bit value that carries enumerated node-state flags, including warnings and alerts; " +
"see https://github.com/NVIDIA/aistore/blob/main/cmn/cos/node_state.go for details", // TODO: must have a readme
Expand Down Expand Up @@ -343,7 +343,7 @@ func (r *runner) Name() string { return r.name }
func (r *runner) Get(name string) (val int64) { return r.core.get(name) }

func (r *runner) nodeStateFlags() cos.NodeStateFlags {
val := r.Get(NodeStateFlags)
val := r.Get(NodeAlerts)
return cos.NodeStateFlags(val)
}

Expand Down Expand Up @@ -478,7 +478,7 @@ func (r *runner) _mem(mm *memsys.MMSA, set, clr cos.NodeStateFlags) {
default:
clr |= cos.OOM | cos.LowMemory
}
r.SetClrFlag(NodeStateFlags, set, clr)
r.SetClrFlag(NodeAlerts, set, clr)
}

func (r *runner) GetStats() *Node {
Expand Down Expand Up @@ -513,13 +513,13 @@ func (r *runner) checkNgr(now, lastNgr int64, goMaxProcs int) int64 {
ngr := runtime.NumGoroutine()
if ngr < lim {
if lastNgr != 0 {
r.ClrFlag(NodeStateFlags, cos.NumGoroutines)
r.ClrFlag(NodeAlerts, cos.NumGoroutines)
nlog.Infoln("Number of goroutines is now back to normal:", ngr)
}
return 0
}
if lastNgr == 0 {
r.SetFlag(NodeStateFlags, cos.NumGoroutines)
r.SetFlag(NodeAlerts, cos.NumGoroutines)
lastNgr = now
} else if d := time.Duration(now - lastNgr); (d >= ngrHighTime) || (ngr > lim<<1 && d >= ngrExtremeTime) {
lastNgr = now
Expand Down

0 comments on commit 1fe3c80

Please sign in to comment.