Skip to content

Commit

Permalink
filesystem health checker (fshc) version 2
Browse files: browse the repository at this point in the history
* CLI 'storage mountpath' to show alerts
* part three, prev. commit: 0319347

Signed-off-by: Alex Aizman <[email protected]>
  • Loading branch information
alex-aizman committed Jul 3, 2024
1 parent 0319347 commit bda7bc9
Show file tree
Hide file tree
Showing 7 changed files with 36 additions and 21 deletions.
2 changes: 1 addition & 1 deletion cmd/cli/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ module github.com/NVIDIA/aistore/cmd/cli
go 1.22.3

require (
github.com/NVIDIA/aistore v1.3.24-0.20240628194324-779a7b9f201e
github.com/NVIDIA/aistore v1.3.24-0.20240703181018-0319347b451e
github.com/fatih/color v1.17.0
github.com/json-iterator/go v1.1.12
github.com/onsi/ginkgo/v2 v2.19.0
Expand Down
4 changes: 2 additions & 2 deletions cmd/cli/go.sum
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
code.cloudfoundry.org/bytefmt v0.0.0-20190710193110-1eb035ffe2b6/go.mod h1:wN/zk7mhREp/oviagqUXY3EwuHhWyOvAdsn5Y4CzOrc=
github.com/BurntSushi/toml v1.3.2/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ=
github.com/NVIDIA/aistore v1.3.24-0.20240628194324-779a7b9f201e h1:lxfuXXJ5XI+M3VZhZyuSg4RZ209cgnLE4Pd2sw+y/lM=
github.com/NVIDIA/aistore v1.3.24-0.20240628194324-779a7b9f201e/go.mod h1:rzuE/hzSFxylpF5sfawzy1DPnkmWchiW11nb1omitq8=
github.com/NVIDIA/aistore v1.3.24-0.20240703181018-0319347b451e h1:yx6jqXtcseRjSvvm5lhqKyKRPNHllXSP2RFPHO0sxu8=
github.com/NVIDIA/aistore v1.3.24-0.20240703181018-0319347b451e/go.mod h1:rzuE/hzSFxylpF5sfawzy1DPnkmWchiW11nb1omitq8=
github.com/OneOfOne/xxhash v1.2.8 h1:31czK/TI9sNkxIKfaUfGlU47BAxQ0ztGgd9vPyqimf8=
github.com/OneOfOne/xxhash v1.2.8/go.mod h1:eZbhyaAYD41SGSSsnmcpxVoRiQ/MPUTjUdIIOT9Um7Q=
github.com/VividCortex/ewma v1.1.1/go.mod h1:2Tkkvm3sRDVXaiyucHiACn4cqf7DpdyLvmxzcbUokwA=
Expand Down
3 changes: 2 additions & 1 deletion cmd/cli/teb/templates.go
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,7 @@ See '--help' and docs/cli for details.`
"\t\t{{ $mp }} " +

"{{range $k, $v := $p.TargetCDF.Mountpaths}}" +
"{{if (IsEqS $k $mp)}}{{$v.FS}}{{end}}" +
"{{if (IsEqS $k $mp)}}{{FormatCDFDisks $v}}{{end}}" +
"{{end}}\n" +

"{{end}}{{end}}" +
Expand Down Expand Up @@ -421,6 +421,7 @@ var (
"FormatProxiesSumm": fmtProxiesSumm,
"FormatTargetsSumm": fmtTargetsSumm,
"FormatCapPctMAM": fmtCapPctMAM,
"FormatCDFDisks": fmtCDFDisks,
"FormatFloat": func(f float64) string { return fmt.Sprintf("%.2f", f) },
"FormatBool": FmtBool,
"FormatBckName": func(bck cmn.Bck) string { return bck.Cname("") },
Expand Down
8 changes: 8 additions & 0 deletions cmd/cli/teb/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,14 @@ func fmtCapPctMAM(tcdf *fs.TargetCDF, list bool) string {
return fmt.Sprintf("%s%2d%%%s %s%2d%%%s %s%2d%%", a, tcdf.PctMin, sepa, b, tcdf.PctAvg, sepa, c, tcdf.PctMax)
}

// fmtCDFDisks renders the filesystem cell for a single mountpath's CDF.
//
// When cdf.HasAlert reports an alert, the result is the filesystem name
// (cdf.FS.Fs) immediately followed by the alert text passed through fred;
// the filesystem type is not shown in that case. Without an alert the
// result is cdf.FS.String(). The index returned by HasAlert is not used.
//
// NOTE(review): fred is defined elsewhere in this package; presumably it
// colors its argument red - confirm.
func fmtCDFDisks(cdf *fs.CDF) string {
	if alert, _ := cdf.HasAlert(); alert != "" {
		return cdf.FS.Fs + fred(alert)
	}
	return cdf.FS.String() // fs.Fs + "(" + fs.FsType + ")"
}

// fmtSmap returns a one-line summary of the cluster map: its version,
// its UUID, and the primary node as rendered by Primary.StringEx().
func fmtSmap(smap *meta.Smap) string {
	const format = "version %d, UUID %s, primary %s"
	primary := smap.Primary.StringEx()
	return fmt.Sprintf(format, smap.Version, smap.UUID, primary)
}
Expand Down
4 changes: 3 additions & 1 deletion fs/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ func (cdf *CDF) HasAlert() (alert string, idx int) {

// _alert tags every disk name in cdf.Disks with the alert string a.
//
// The tag is appended at most once per disk: a name that already contains a
// is left untouched, so calling _alert repeatedly with the same alert does
// not keep growing the disk names.
func (cdf *CDF) _alert(a string) {
	for i, d := range cdf.Disks {
		if strings.Contains(d, a) {
			continue // already tagged with this alert
		}
		cdf.Disks[i] = d + a
	}
}
2 changes: 1 addition & 1 deletion fs/fs.go
Original file line number Diff line number Diff line change
Expand Up @@ -1152,7 +1152,7 @@ func CapRefresh(config *cmn.Config, tcdf *TargetCDF) (cs CapStatus, _, errCap er
cdf.Capacity = c

// add alerts
// (not mutually exclusive, but we add only one here in order of priority)
// (the bits are not mutually exclusive, but we add only one here in the order of priority)
switch {
case mi.IsAnySet(FlagDisabledByFSHC):
cdf._alert(DiskFaulted)
Expand Down
34 changes: 19 additions & 15 deletions stats/target_stats.go
Original file line number Diff line number Diff line change
Expand Up @@ -342,16 +342,8 @@ func (r *Trunner) log(now int64, uptime time.Duration, config *cmn.Config) {
}
}

// 3. capacity and associated node state flags
set, clr, hasAlerts := r._cap(config, now)

// 3.5. TODO -- FIXME: revisit
flags := r.nodeStateFlags()
if hasAlerts {
r.lines = append(r.lines, "Warning: check for mountpath alerts, node-flags: "+flags.String())
} else if flags.IsSet(cos.DiskFault) {
clr |= cos.DiskFault
}
// 3. capacity, mountpath alerts, and associated node state flags
set, clr := r._cap(config, now)

// 4. append disk stats to log subject to (idle) filtering (see related: `ignoreIdle`)
r.logDiskStats(now)
Expand Down Expand Up @@ -397,17 +389,21 @@ func (r *Trunner) log(now int64, uptime time.Duration, config *cmn.Config) {
}
}

func (r *Trunner) _cap(config *cmn.Config, now int64) (set, clr cos.NodeStateFlags, hasAlerts bool) {
func (r *Trunner) _cap(config *cmn.Config, now int64) (set, clr cos.NodeStateFlags) {
cs, updated, err, errCap := fs.CapPeriodic(now, config, &r.TargetCDF)
if err != nil {
nlog.Errorln(err)
debug.Assert(!updated && errCap == nil, updated, " ", errCap)
return 0, 0, false
return 0, 0
}
if !updated && errCap == nil { // nothing to do
return 0, 0, false
return 0, 0
}
pcs := &cs

var (
pcs = &cs
hasAlerts bool
)
if !updated {
pcs = nil // to possibly force refresh via t.OOS
} else {
Expand Down Expand Up @@ -456,6 +452,14 @@ func (r *Trunner) _cap(config *cmn.Config, now int64) (set, clr cos.NodeStateFla
r.cs.last = now
}

// and more
flags := r.nodeStateFlags()
if hasAlerts {
r.lines = append(r.lines, "Warning: node-state-flags", flags.String(), "(check mountpath alerts!)")
} else if flags.IsSet(cos.DiskFault) && updated {
clr |= cos.DiskFault
}

// cap alert
if cs.IsOOS() {
set = cos.OOS
Expand All @@ -465,7 +469,7 @@ func (r *Trunner) _cap(config *cmn.Config, now int64) (set, clr cos.NodeStateFla
} else {
clr = cos.OOS | cos.LowCapacity
}
return set, clr, hasAlerts
return set, clr
}

// log formatted disk stats:
Expand Down

0 comments on commit bda7bc9

Please sign in to comment.