Skip to content

Commit

Permalink
Merge pull request #9680 from filecoin-project/feat/config-post-check…
Browse files Browse the repository at this point in the history
…-timeout

feat: wdpost: Configurable pre-check timeouts
  • Loading branch information
magik6k authored Nov 17, 2022
2 parents 716b981 + 71a84bb commit 60c79c7
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 7 deletions.
23 changes: 23 additions & 0 deletions documentation/en/default-lotus-miner-config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,29 @@
# env var: LOTUS_PROVING_PARALLELCHECKLIMIT
#ParallelCheckLimit = 128

# Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped
#
# WARNING: Setting this value too low risks in sectors being skipped even though they are accessible, just reading the
# test challenge took longer than this timeout
# WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are
# blocked (e.g. in case of disconnected NFS mount)
#
# type: Duration
# env var: LOTUS_PROVING_SINGLECHECKTIMEOUT
#SingleCheckTimeout = "10m0s"

# Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in
# the partition which didn't get checked on time will be skipped
#
# WARNING: Setting this value too low risks in sectors being skipped even though they are accessible, just reading the
# test challenge took longer than this timeout
# WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are
# blocked or slow
#
# type: Duration
# env var: LOTUS_PROVING_PARTITIONCHECKTIMEOUT
#PartitionCheckTimeout = "20m0s"

# Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present.
#
# WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need
Expand Down
4 changes: 3 additions & 1 deletion node/config/def.go
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,9 @@ func DefaultStorageMiner() *StorageMiner {
},

Proving: ProvingConfig{
ParallelCheckLimit: 128,
ParallelCheckLimit: 128,
PartitionCheckTimeout: Duration(20 * time.Minute),
SingleCheckTimeout: Duration(10 * time.Minute),
},

Storage: SealerConfig{
Expand Down
23 changes: 23 additions & 0 deletions node/config/doc_gen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions node/config/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,23 @@ type ProvingConfig struct {
// 'lotus-miner proving compute window-post 0'
ParallelCheckLimit int

// Maximum amount of time a proving pre-check can take for a sector. If the check times out the sector will be skipped
//
// WARNING: Setting this value too low risks in sectors being skipped even though they are accessible, just reading the
// test challenge took longer than this timeout
// WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this sector are
// blocked (e.g. in case of disconnected NFS mount)
SingleCheckTimeout Duration

// Maximum amount of time a proving pre-check can take for an entire partition. If the check times out, sectors in
// the partition which didn't get checked on time will be skipped
//
// WARNING: Setting this value too low risks in sectors being skipped even though they are accessible, just reading the
// test challenge took longer than this timeout
// WARNING: Setting this value too high risks missing PoSt deadline in case IO operations related to this partition are
// blocked or slow
PartitionCheckTimeout Duration

// Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present.
//
// WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need
Expand Down
22 changes: 16 additions & 6 deletions storage/sealer/faults.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import (
"crypto/rand"
"fmt"
"sync"
"time"

"golang.org/x/xerrors"

Expand All @@ -15,8 +14,6 @@ import (
"github.com/filecoin-project/lotus/storage/sealer/storiface"
)

var PostCheckTimeout = 160 * time.Second

// FaultTracker TODO: Track things more actively
type FaultTracker interface {
CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof, sectors []storiface.SectorRef, rg storiface.RGetter) (map[abi.SectorID]string, error)
Expand Down Expand Up @@ -50,14 +47,22 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof,
badLk.Unlock()
}

if m.partitionCheckTimeout > 0 {
var cancel2 context.CancelFunc
ctx, cancel2 = context.WithTimeout(ctx, m.partitionCheckTimeout)
defer cancel2()
}

var wg sync.WaitGroup
wg.Add(len(sectors))

for _, sector := range sectors {
select {
case throttle <- struct{}{}:
case <-ctx.Done():
return nil, ctx.Err()
addBad(sector.ID, fmt.Sprintf("waiting for check worker: %s", ctx.Err()))
wg.Done()
continue
}

go func(sector storiface.SectorRef) {
Expand Down Expand Up @@ -107,8 +112,13 @@ func (m *Manager) CheckProvable(ctx context.Context, pp abi.RegisteredPoStProof,
return
}

vctx, cancel2 := context.WithTimeout(ctx, PostCheckTimeout)
defer cancel2()
vctx := ctx

if m.singleCheckTimeout > 0 {
var cancel2 context.CancelFunc
vctx, cancel2 = context.WithTimeout(ctx, m.singleCheckTimeout)
defer cancel2()
}

_, err = m.storage.GenerateSingleVanillaProof(vctx, sector.ID.Miner, storiface.PostSectorChallenge{
SealProof: sector.ProofType,
Expand Down
5 changes: 5 additions & 0 deletions storage/sealer/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"net/http"
"sort"
"sync"
"time"

"github.com/google/uuid"
"github.com/hashicorp/go-multierror"
Expand Down Expand Up @@ -72,6 +73,8 @@ type Manager struct {
work *statestore.StateStore

parallelCheckLimit int
singleCheckTimeout time.Duration
partitionCheckTimeout time.Duration
disableBuiltinWindowPoSt bool
disableBuiltinWinningPoSt bool
disallowRemoteFinalize bool
Expand Down Expand Up @@ -121,6 +124,8 @@ func New(ctx context.Context, lstor *paths.Local, stor paths.Store, ls paths.Loc
localProver: prover,

parallelCheckLimit: pc.ParallelCheckLimit,
singleCheckTimeout: time.Duration(pc.SingleCheckTimeout),
partitionCheckTimeout: time.Duration(pc.PartitionCheckTimeout),
disableBuiltinWindowPoSt: pc.DisableBuiltinWindowPoSt,
disableBuiltinWinningPoSt: pc.DisableBuiltinWinningPoSt,
disallowRemoteFinalize: sc.DisallowRemoteFinalize,
Expand Down

0 comments on commit 60c79c7

Please sign in to comment.