Skip to content

Commit

Permalink
filesystem health checker (fshc) version 2
Browse files Browse the repository at this point in the history
* at runtime: resolve (mpath, FS) to disks, and handle:
  - no disks
  - disk loss
  - new disk attachments
* part five, prev. commit: ccef808

Signed-off-by: Alex Aizman <[email protected]>
  • Loading branch information
alex-aizman committed Jul 12, 2024
1 parent a1b58cf commit 1ed2c1b
Show file tree
Hide file tree
Showing 16 changed files with 332 additions and 140 deletions.
2 changes: 1 addition & 1 deletion ais/target.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ func (t *target) init(config *cmn.Config) {
daemon.rg.add(t)

ts := stats.NewTrunner(t) // iostat below
startedUp := ts.Init(t) // reg common metrics (and target-only - via RegMetrics/regDiskMetrics below)
startedUp := ts.Init() // reg common metrics (and target-only - via RegMetrics/regDiskMetrics below)
daemon.rg.add(ts)
t.statsT = ts // stats tracker

Expand Down
4 changes: 3 additions & 1 deletion ais/tgtcp.go
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,9 @@ func (t *target) httpdaeget(w http.ResponseWriter, r *http.Request) {
diskStats = make(ios.AllDiskStats, fs.NumAvail())
config = cmn.GCO.Get()
)
fs.DiskStats(diskStats, config)
if mi, err := fs.DiskStats(diskStats, config); err != nil {
t.FSHC(err, mi, "")
}
t.writeJSON(w, r, diskStats, httpdaeWhat)
case apc.WhatRemoteAIS:
var (
Expand Down
25 changes: 14 additions & 11 deletions ais/tgtfshc.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,24 @@ import (

func (t *target) FSHC(err error, mi *fs.Mountpath, fqn string) {
config := cmn.GCO.Get()

if cmn.IsErrCapExceeded(err) {
cs := t.oos(config)
nlog.Errorf("%s: OOS (%s) via FSHC", t, cs.String())
return
}

if !config.FSHC.Enabled {
return
}
if !cos.IsIOError(err) { // TODO -- FIXME: review the selection
debug.Assert(!cos.IsErrOOS(err)) // is checked below
if !t.fshc.IsErr(err) {
if cmn.Rom.FastV(4, cos.SmoduleAIS) {
nlog.Warningln(err, "is not one of the error types to trigger FSHC, ignoring...")
}
return
}

s := fmt.Sprintf("waking up FSHC to check %q for err [%v]", fqn, err) // or maybe not (waking up)
s := fmt.Sprintf("waking up FSHC to check %s for [%v]", mi, err) // or maybe not (waking up)

if mi == nil {
mi, _, err = fs.FQN2Mpath(fqn)
Expand All @@ -45,19 +54,13 @@ func (t *target) FSHC(err error, mi *fs.Mountpath, fqn string) {
return
}

if cos.IsErrOOS(err) {
cs := t.oos(config)
nlog.Errorf("%s: OOS (%s), not %s", t, cs.String(), s)
return
}

if err := cos.Stat(mi.Path); err != nil {
// FATAL (unlikely)
cos.ExitLogf("%s: available %s fails fstat: %v", t, mi, err)
nlog.Errorf("[FATAL %s]: available %s is not: %v", t, mi, err)
}

// yes "waking up"
nlog.Errorln(t.String()+":", s)

//
// metrics: counting I/O errors on a per mountpath (`NameSuffix` below) basis
//
Expand Down
40 changes: 19 additions & 21 deletions cmn/cos/err_darwin.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Package cos provides common constants, types, and utilities for AIS clients
// and AIStore.
/*
* Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
*/
package cos

Expand All @@ -10,31 +10,29 @@ import (
"io"
"os"
"syscall"

"github.com/NVIDIA/aistore/cmn/debug"
)

var ioErrs = []error{
io.ErrShortWrite,

syscall.EIO, // I/O error
syscall.ENOTDIR, // mountpath is missing
syscall.EBUSY, // device or resource is busy
syscall.ENXIO, // No such device
syscall.EBADF, // Bad file number
syscall.ENODEV, // No such device
syscall.EROFS, // readonly filesystem
syscall.EDQUOT, // quota exceeded
syscall.ESTALE, // stale file handle
syscall.ENOSPC, // no space left
}

// Checks if the error is generated by any IO operation and if the error
// is severe enough to run the FSHC for mountpath testing
//
// For mountpath definition, see fs/mountfs.go
func IsIOError(err error) bool {
if err == nil {
return false
}

ioErrs := []error{
io.ErrShortWrite,

syscall.EIO, // I/O error
syscall.ENOTDIR, // mountpath is missing
syscall.EBUSY, // device or resource is busy
syscall.ENXIO, // No such device
syscall.EBADF, // Bad file number
syscall.ENODEV, // No such device
syscall.EROFS, // readonly filesystem
syscall.EDQUOT, // quota exceeded
syscall.ESTALE, // stale file handle
syscall.ENOSPC, // no space left
}
debug.Assert(err != nil)
for _, ioErr := range ioErrs {
if errors.Is(err, ioErr) {
return true
Expand Down
42 changes: 20 additions & 22 deletions cmn/cos/err_linux.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Package cos provides common constants, types, and utilities for AIS clients
// and AIStore.
/*
* Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2018-2024, NVIDIA CORPORATION. All rights reserved.
*/
package cos

Expand All @@ -10,32 +10,30 @@ import (
"io"
"os"
"syscall"

"github.com/NVIDIA/aistore/cmn/debug"
)

var ioErrs = []error{
io.ErrShortWrite,

syscall.EIO, // I/O error
syscall.ENOTDIR, // mountpath is missing
syscall.EBUSY, // device or resource is busy
syscall.ENXIO, // No such device
syscall.EBADF, // Bad file number
syscall.ENODEV, // No such device
syscall.EUCLEAN, // (mkdir)structure needs cleaning = broken filesystem
syscall.EROFS, // readonly filesystem
syscall.EDQUOT, // quota exceeded
syscall.ESTALE, // stale file handle
syscall.ENOSPC, // no space left
}

// Checks if the error is generated by any IO operation and if the error
// is severe enough to run the FSHC for mountpath testing
//
// For mountpath definition, see fs/mountfs.go
func IsIOError(err error) bool {
if err == nil {
return false
}

ioErrs := []error{
io.ErrShortWrite,

syscall.EIO, // I/O error
syscall.ENOTDIR, // mountpath is missing
syscall.EBUSY, // device or resource is busy
syscall.ENXIO, // No such device
syscall.EBADF, // Bad file number
syscall.ENODEV, // No such device
syscall.EUCLEAN, // (mkdir)structure needs cleaning = broken filesystem
syscall.EROFS, // readonly filesystem
syscall.EDQUOT, // quota exceeded
syscall.ESTALE, // stale file handle
syscall.ENOSPC, // no space left
}
debug.Assert(err != nil)
for _, ioErr := range ioErrs {
if errors.Is(err, ioErr) {
return true
Expand Down
77 changes: 76 additions & 1 deletion cmn/err.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,10 @@ type (
usedPct int32
oos bool
}
ErrGetCap struct {
err error
}

ErrBucketAccessDenied struct{ errAccessDenied }
ErrObjectAccessDenied struct{ errAccessDenied }
errAccessDenied struct {
Expand All @@ -128,6 +132,25 @@ type (
mpath string
cause string
}
ErrMountpathNoDisks struct {
mpath string
fs string
err error
}
ErrMountpathLostDisk struct {
mpath string
fs string
lostd string
disks []string
fsdisks []string
}
ErrMountpathNewDisk struct {
mpath string
fs string
disks []string
fsdisks []string
}

ErrInvalidFSPathsConf struct {
err error
}
Expand Down Expand Up @@ -455,7 +478,22 @@ func (e *ErrCapExceeded) Error() string {

func IsErrCapExceeded(err error) bool {
_, ok := err.(*ErrCapExceeded)
return ok || cos.IsErrOOS(err)
return ok || cos.IsErrOOS(err) // NOTE: a superset
}

// ErrGetCap

// NewErrGetCap returns a new *ErrGetCap that carries err as its underlying cause.
func NewErrGetCap(err error) *ErrGetCap {
	e := &ErrGetCap{}
	e.err = err
	return e
}

// Error implements the error interface: a fixed "failed to update capacity"
// prefix followed by the underlying cause formatted with %v.
func (e *ErrGetCap) Error() string {
	const format = "failed to update capacity: %v"
	return fmt.Sprintf(format, e.err)
}

// IsErrGetCap reports whether the dynamic type of err is *ErrGetCap.
// The check looks at err itself only; it does not unwrap error chains.
func IsErrGetCap(err error) bool {
	switch err.(type) {
	case *ErrGetCap:
		return true
	default:
		return false
	}
}

// ErrInvalidCksum
Expand Down Expand Up @@ -508,6 +546,43 @@ func NewErrInvalidaMountpath(mpath, cause string) *ErrInvalidMountpath {
return &ErrInvalidMountpath{mpath: mpath, cause: cause}
}

// ErrMountpathNoDisks

// NewErrMountpathNoDisks returns a new *ErrMountpathNoDisks recording the
// mountpath, its filesystem, and the underlying error.
func NewErrMountpathNoDisks(mpath, fs string, err error) *ErrMountpathNoDisks {
	e := &ErrMountpathNoDisks{}
	e.mpath, e.fs = mpath, fs
	e.err = err
	return e
}

// Error implements the error interface, naming the mountpath and filesystem
// that have no disks, followed by the underlying error.
func (e *ErrMountpathNoDisks) Error() string {
	const format = "mp[%s, fs=%s] has no disks, err: %v"
	return fmt.Sprintf(format, e.mpath, e.fs, e.err)
}

// ErrMountpathLostDisk

// NewErrMountpathLostDisk returns a new *ErrMountpathLostDisk.
//
// lostd is the name of the lost disk; per the Error() text, disks is the
// original disk list and fsdisks is the list available now. Both slices are
// stored as passed (not copied).
func NewErrMountpathLostDisk(mpath, fs, lostd string, disks, fsdisks []string) *ErrMountpathLostDisk {
	e := &ErrMountpathLostDisk{}
	e.mpath, e.fs, e.lostd = mpath, fs, lostd
	e.disks, e.fsdisks = disks, fsdisks
	return e
}

// Error implements the error interface: reports the lost disk (quoted) along
// with the original and the currently available disk lists.
func (e *ErrMountpathLostDisk) Error() string {
	const format = "mp[%s, fs=%s]: disk %q is lost (orig: %v, available now: %v)"
	return fmt.Sprintf(format, e.mpath, e.fs, e.lostd, e.disks, e.fsdisks)
}

// ErrMountpathNewDisk

// NewErrMountpathNewDisk returns a new *ErrMountpathNewDisk.
//
// Per the Error() text, disks is the original disk list and fsdisks is the
// list available now. Both slices are stored as passed (not copied).
func NewErrMountpathNewDisk(mpath, fs string, disks, fsdisks []string) *ErrMountpathNewDisk {
	e := &ErrMountpathNewDisk{}
	e.mpath, e.fs = mpath, fs
	e.disks, e.fsdisks = disks, fsdisks
	return e
}

// Error implements the error interface: reports newly attached disk(s) along
// with the original and the currently available disk lists.
//
// The suffix after "disk" is produced by cos.Plural from the difference
// between the current and the original disk counts.
// NOTE(review): presumably yields "s" for counts other than one - confirm
// against cos.Plural.
func (e *ErrMountpathNewDisk) Error() string {
	added := len(e.fsdisks) - len(e.disks)
	suffix := cos.Plural(added)
	return fmt.Sprintf("mp[%s, fs=%s]: newly attached disk%s (orig: %v, available now: %v)",
		e.mpath, e.fs, suffix, e.disks, e.fsdisks)
}

// IsErrMountpathNewDisk reports whether the dynamic type of err is
// *ErrMountpathNewDisk. The check looks at err itself only; it does not
// unwrap error chains.
func IsErrMountpathNewDisk(err error) bool {
	switch err.(type) {
	case *ErrMountpathNewDisk:
		return true
	default:
		return false
	}
}

// ErrInvalidFSPathsConf

func NewErrInvalidFSPathsConf(err error) *ErrInvalidFSPathsConf {
Expand Down
13 changes: 9 additions & 4 deletions core/mock/iostat_mock.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,16 @@ type IOS struct {
}

func NewIOS() *IOS { return &IOS{} }
func (*IOS) Clblk() {}
func (m *IOS) GetAllMpathUtils() *ios.MpathUtil { return &m.Utils }
func (m *IOS) GetMpathUtil(mpath string) int64 { return m.Utils.Get(mpath) }

func (*IOS) AddMpath(string, string, ios.Label, *cmn.Config) (ios.FsDisks, error) { return nil, nil }
func (*IOS) HealthMpath(string) error { return nil }
func (*IOS) RemoveMpath(string, bool) {}
func (*IOS) LogAppend(l []string) []string { return l }
func (*IOS) DiskStats(ios.AllDiskStats) {}

func (*IOS) RefreshDisks(string, string, []string) ios.RefreshDisksResult {
return ios.RefreshDisksResult{}
}

func (*IOS) RemoveMpath(string, bool) {}
func (*IOS) LogAppend(l []string) []string { return l }
func (*IOS) DiskStats(ios.AllDiskStats) {}
8 changes: 2 additions & 6 deletions core/target.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,19 +78,15 @@ type (
)

type (
NodeCapacity interface {
// a node that can also write objects
TargetPut interface {
Node

// Space
OOS(*fs.CapStatus, *cmn.Config, *fs.TargetCDF) fs.CapStatus

// xactions (jobs) now
GetAllRunning(inout *AllRunningInOut, periodic bool)
}

// a node that can also write objects
TargetPut interface {
NodeCapacity

// PUT params.Reader => lom
PutObject(lom *LOM, params *PutParams) (err error)
Expand Down
5 changes: 0 additions & 5 deletions fs/err.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,8 @@ type (
Msg string
Code int // Sie* enum above
}
ErrMountpathNoDisks struct {
Mi *Mountpath
}
)

func (e *ErrMountpathNoDisks) Error() string { return fmt.Sprintf("%s has no disks", e.Mi) }

func (sie *ErrStorageIntegrity) Error() string {
err := fmt.Errorf(cmn.FmtErrIntegrity, siePrefix, sie.Code, cmn.GitHubHome)
return fmt.Sprintf("%v: %s", err, sie.Msg)
Expand Down
Loading

0 comments on commit 1ed2c1b

Please sign in to comment.