diff --git a/.circleci/config.yml b/.circleci/config.yml index 52a37a59fa3..914fa2ffddb 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -958,6 +958,11 @@ workflows: suite: itest-verifreg target: "./itests/verifreg_test.go" + - test: + name: test-itest-wdpost_check_config + suite: itest-wdpost_check_config + target: "./itests/wdpost_check_config_test.go" + - test: name: test-itest-wdpost_dispute suite: itest-wdpost_dispute @@ -968,6 +973,11 @@ workflows: suite: itest-wdpost target: "./itests/wdpost_test.go" + - test: + name: test-itest-wdpost_worker_config + suite: itest-wdpost_worker_config + target: "./itests/wdpost_worker_config_test.go" + - test: name: test-itest-worker suite: itest-worker diff --git a/documentation/en/default-lotus-config.toml b/documentation/en/default-lotus-config.toml index ff4be96c8af..d33abd3c251 100644 --- a/documentation/en/default-lotus-config.toml +++ b/documentation/en/default-lotus-config.toml @@ -15,6 +15,9 @@ [Backup] + # When set to true disables metadata log (.lotus/kvlog). This can save disk + # space by reducing metadata redundancy. + # # Note that in case of metadata corruption it might be much harder to recover # your node if metadata log is disabled # diff --git a/documentation/en/default-lotus-miner-config.toml b/documentation/en/default-lotus-miner-config.toml index 44a9117e3e6..b3992690499 100644 --- a/documentation/en/default-lotus-miner-config.toml +++ b/documentation/en/default-lotus-miner-config.toml @@ -15,6 +15,9 @@ [Backup] + # When set to true disables metadata log (.lotus/kvlog). This can save disk + # space by reducing metadata redundancy. + # # Note that in case of metadata corruption it might be much harder to recover # your node if metadata log is disabled # @@ -310,11 +313,68 @@ [Proving] # Maximum number of sector checks to run in parallel. (0 = unlimited) + # + # WARNING: Setting this value too high may make the node crash by running out of stack + # WARNING: Setting this value too low may make sector challenge reading much slower, resulting in failed PoSt due + # to late submission. + # + # After changing this option, confirm that the new value works in your setup by invoking + # 'lotus-miner proving compute window-post 0' # # type: int # env var: LOTUS_PROVING_PARALLELCHECKLIMIT #ParallelCheckLimit = 128 + # Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present. + # + # WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need + # to be recovered. Before enabling this option, make sure your PoSt workers work correctly. + # + # After changing this option, confirm that the new value works in your setup by invoking + # 'lotus-miner proving compute window-post 0' + # + # type: bool + # env var: LOTUS_PROVING_DISABLEBUILTINWINDOWPOST + #DisableBuiltinWindowPoSt = false + + # Disable Winning PoSt computation on the lotus-miner process even if no winning PoSt workers are present. + # + # WARNING: If no WinningPoSt workers are connected, Winning PoSt WILL FAIL resulting in lost block rewards. + # Before enabling this option, make sure your PoSt workers work correctly. + # + # type: bool + # env var: LOTUS_PROVING_DISABLEBUILTINWINNINGPOST + #DisableBuiltinWinningPoSt = false + + # Disable WindowPoSt provable sector readability checks. + # + # In normal operation, when preparing to compute WindowPoSt, lotus-miner will perform a round of reading challenges + # from all sectors to confirm that those sectors can be proven. Challenges read in this process are discarded, as + # we're only interested in checking that sector data can be read. + # + # When using builtin proof computation (no PoSt workers, and DisableBuiltinWindowPoSt is set to false), this process + # can save a lot of time and compute resources in the case that some sectors are not readable - this is caused by + # the builtin logic not skipping snark computation when some sectors need to be skipped. + # + # When using PoSt workers, this process is mostly redundant, with PoSt workers challenges will be read once, and + # if challenges for some sectors aren't readable, those sectors will just get skipped. + # + # Disabling sector pre-checks will slightly reduce IO load when proving sectors, possibly resulting in shorter + # time to produce window PoSt. In setups with good IO capabilities the effect of this option on proving time should + # be negligible. + # + # NOTE: It likely is a bad idea to disable sector pre-checks in setups with no PoSt workers. + # + # NOTE: Even when this option is enabled, recovering sectors will be checked before recovery declaration message is + # sent to the chain + # + # After changing this option, confirm that the new value works in your setup by invoking + # 'lotus-miner proving compute window-post 0' + # + # type: bool + # env var: LOTUS_PROVING_DISABLEWDPOSTPRECHECKS + #DisableWDPoStPreChecks = false + [Sealing] # Upper bound on how many sectors can be waiting for more deals to be packed in it before it begins sealing at any given time. diff --git a/itests/wdpost_check_config_test.go b/itests/wdpost_check_config_test.go new file mode 100644 index 00000000000..cc10af04f9f --- /dev/null +++ b/itests/wdpost_check_config_test.go @@ -0,0 +1,188 @@ +package itests + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/filecoin-project/go-address" + "github.com/filecoin-project/go-state-types/abi" + + "github.com/filecoin-project/lotus/chain/types" + "github.com/filecoin-project/lotus/itests/kit" + "github.com/filecoin-project/lotus/node" + "github.com/filecoin-project/lotus/node/config" + "github.com/filecoin-project/lotus/node/impl" + "github.com/filecoin-project/lotus/node/modules" + "github.com/filecoin-project/lotus/storage/sealer/mock" + "github.com/filecoin-project/lotus/storage/sealer/storiface" + "github.com/filecoin-project/lotus/storage/wdpost" +) + +func TestWindowPostNoPreChecks(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + client, miner, ens := kit.EnsembleMinimal(t, + kit.LatestActorsAt(-1), + kit.MockProofs(), + kit.ConstructorOpts( + node.Override(new(*wdpost.WindowPoStScheduler), modules.WindowPostScheduler( + config.DefaultStorageMiner().Fees, + config.ProvingConfig{ + DisableWDPoStPreChecks: true, + }, + )))) + ens.InterconnectAll().BeginMining(2 * time.Millisecond) + + nSectors := 10 + + miner.PledgeSectors(ctx, nSectors, 0, nil) + + maddr, err := miner.ActorAddress(ctx) + require.NoError(t, err) + di, err := client.StateMinerProvingDeadline(ctx, maddr, types.EmptyTSK) + require.NoError(t, err) + + mid, err := address.IDFromAddress(maddr) + require.NoError(t, err) + + t.Log("Running one proving period") + waitUntil := di.Open + di.WPoStProvingPeriod + t.Logf("End for head.Height > %d", waitUntil) + + ts := client.WaitTillChain(ctx, kit.HeightAtLeast(waitUntil)) + t.Logf("Now head.Height = %d", ts.Height()) + + p, err := client.StateMinerPower(ctx, maddr, types.EmptyTSK) + require.NoError(t, err) + + ssz, err := miner.ActorSectorSize(ctx, maddr) + require.NoError(t, err) + + require.Equal(t, p.MinerPower, p.TotalPower) + require.Equal(t, p.MinerPower.RawBytePower, types.NewInt(uint64(ssz)*uint64(nSectors+kit.DefaultPresealsPerBootstrapMiner))) + + t.Log("Drop some sectors") + + // Drop 2 sectors from deadline 2 partition 0 (full partition / deadline) + { + parts, err := client.StateMinerPartitions(ctx, maddr, 2, types.EmptyTSK) + require.NoError(t, err) + require.Greater(t, len(parts), 0) + + secs := parts[0].AllSectors + n, err := secs.Count() + require.NoError(t, err) + require.Equal(t, uint64(2), n) + + // Drop the partition + err = secs.ForEach(func(sid uint64) error { + return miner.StorageMiner.(*impl.StorageMinerAPI).IStorageMgr.(*mock.SectorMgr).MarkCorrupted(storiface.SectorRef{ + ID: abi.SectorID{ + Miner: abi.ActorID(mid), + Number: abi.SectorNumber(sid), + }, + }, true) + }) + require.NoError(t, err) + } + + var s storiface.SectorRef + + // Drop 1 sectors from deadline 3 partition 0 + { + parts, err := client.StateMinerPartitions(ctx, maddr, 3, types.EmptyTSK) + require.NoError(t, err) + require.Greater(t, len(parts), 0) + + secs := parts[0].AllSectors + n, err := secs.Count() + require.NoError(t, err) + require.Equal(t, uint64(2), n) + + // Drop the sector + sn, err := secs.First() + require.NoError(t, err) + + all, err := secs.All(2) + require.NoError(t, err) + t.Log("the sectors", all) + + s = storiface.SectorRef{ + ID: abi.SectorID{ + Miner: abi.ActorID(mid), + Number: abi.SectorNumber(sn), + }, + } + + err = miner.StorageMiner.(*impl.StorageMinerAPI).IStorageMgr.(*mock.SectorMgr).MarkFailed(s, true) + require.NoError(t, err) + } + + di, err = client.StateMinerProvingDeadline(ctx, maddr, types.EmptyTSK) + require.NoError(t, err) + + t.Log("Go through another PP, wait for sectors to become faulty") + waitUntil = di.Open + di.WPoStProvingPeriod + t.Logf("End for head.Height > %d", waitUntil) + + ts = client.WaitTillChain(ctx, kit.HeightAtLeast(waitUntil)) + t.Logf("Now head.Height = %d", ts.Height()) + + p, err = client.StateMinerPower(ctx, maddr, types.EmptyTSK) + require.NoError(t, err) + + require.Equal(t, p.MinerPower, p.TotalPower) + + sectors := p.MinerPower.RawBytePower.Uint64() / uint64(ssz) + require.Equal(t, nSectors+kit.DefaultPresealsPerBootstrapMiner-3, int(sectors)) // -3 just removed sectors + + t.Log("Recover one sector") + + err = miner.StorageMiner.(*impl.StorageMinerAPI).IStorageMgr.(*mock.SectorMgr).MarkFailed(s, false) + require.NoError(t, err) + + di, err = client.StateMinerProvingDeadline(ctx, maddr, types.EmptyTSK) + require.NoError(t, err) + + waitUntil = di.Open + di.WPoStProvingPeriod + t.Logf("End for head.Height > %d", waitUntil) + + ts = client.WaitTillChain(ctx, kit.HeightAtLeast(waitUntil)) + t.Logf("Now head.Height = %d", ts.Height()) + + p, err = client.StateMinerPower(ctx, maddr, types.EmptyTSK) + require.NoError(t, err) + + require.Equal(t, p.MinerPower, p.TotalPower) + + sectors = p.MinerPower.RawBytePower.Uint64() / uint64(ssz) + require.Equal(t, nSectors+kit.DefaultPresealsPerBootstrapMiner-2, int(sectors)) // -2 not recovered sectors + + // pledge a sector after recovery + + miner.PledgeSectors(ctx, 1, nSectors, nil) + + { + // Wait until proven. + di, err = client.StateMinerProvingDeadline(ctx, maddr, types.EmptyTSK) + require.NoError(t, err) + + waitUntil := di.Open + di.WPoStProvingPeriod + t.Logf("End for head.Height > %d\n", waitUntil) + + ts := client.WaitTillChain(ctx, kit.HeightAtLeast(waitUntil)) + t.Logf("Now head.Height = %d", ts.Height()) + } + + p, err = client.StateMinerPower(ctx, maddr, types.EmptyTSK) + require.NoError(t, err) + + require.Equal(t, p.MinerPower, p.TotalPower) + + sectors = p.MinerPower.RawBytePower.Uint64() / uint64(ssz) + require.Equal(t, nSectors+kit.DefaultPresealsPerBootstrapMiner-2+1, int(sectors)) // -2 not recovered sectors + 1 just pledged +} diff --git a/itests/wdpost_worker_config_test.go b/itests/wdpost_worker_config_test.go new file mode 100644 index 00000000000..a84896eae1a --- /dev/null +++ b/itests/wdpost_worker_config_test.go @@ -0,0 +1,201 @@ +package itests + +import ( + "context" + "testing" + "time" + + logging "github.com/ipfs/go-log/v2" + "github.com/stretchr/testify/require" + + "github.com/filecoin-project/go-address" + "github.com/filecoin-project/go-state-types/abi" + + "github.com/filecoin-project/lotus/chain/types" + "github.com/filecoin-project/lotus/itests/kit" + "github.com/filecoin-project/lotus/node" + "github.com/filecoin-project/lotus/node/config" + "github.com/filecoin-project/lotus/node/impl" + "github.com/filecoin-project/lotus/node/modules" + "github.com/filecoin-project/lotus/storage/sealer" + "github.com/filecoin-project/lotus/storage/sealer/sealtasks" + "github.com/filecoin-project/lotus/storage/sealer/storiface" + "github.com/filecoin-project/lotus/storage/wdpost" +) + +func TestWindowPostNoBuiltinWindow(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + _ = logging.SetLogLevel("storageminer", "INFO") + + sectors := 2 * 48 * 2 + + client, miner, _, ens := kit.EnsembleWorker(t, + kit.PresealSectors(sectors), // 2 sectors per partition, 2 partitions in all 48 deadlines + kit.LatestActorsAt(-1), + kit.ConstructorOpts( + node.Override(new(sealer.Config), func() sealer.Config { + c := config.DefaultStorageMiner().StorageManager() + c.DisableBuiltinWindowPoSt = true + return c + }), + node.Override(new(*wdpost.WindowPoStScheduler), modules.WindowPostScheduler( + config.DefaultStorageMiner().Fees, + config.ProvingConfig{ + DisableWDPoStPreChecks: false, + }, + ))), + kit.ThroughRPC()) // generic non-post worker + + maddr, err := miner.ActorAddress(ctx) + require.NoError(t, err) + + di, err := client.StateMinerProvingDeadline(ctx, maddr, types.EmptyTSK) + require.NoError(t, err) + + bm := ens.InterconnectAll().BeginMining(2 * time.Millisecond)[0] + + di = di.NextNotElapsed() + + t.Log("Running one proving period") + waitUntil := di.Open + di.WPoStChallengeWindow*2 + wdpost.SubmitConfidence + client.WaitTillChain(ctx, kit.HeightAtLeast(waitUntil)) + + t.Log("Waiting for post message") + bm.Stop() + + var lastPending []*types.SignedMessage + for i := 0; i < 500; i++ { + lastPending, err = client.MpoolPending(ctx, types.EmptyTSK) + require.NoError(t, err) + + if len(lastPending) > 0 { + break + } + time.Sleep(10 * time.Millisecond) + } + + // expect no post messages + require.Equal(t, len(lastPending), 0) +} + +func TestWindowPostNoBuiltinWindowWithWorker(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + _ = logging.SetLogLevel("storageminer", "INFO") + + sectors := 2 * 48 * 2 + + client, miner, _, ens := kit.EnsembleWorker(t, + kit.PresealSectors(sectors), // 2 sectors per partition, 2 partitions in all 48 deadlines + kit.LatestActorsAt(-1), + kit.ConstructorOpts( + node.Override(new(sealer.Config), func() sealer.Config { + c := config.DefaultStorageMiner().StorageManager() + c.DisableBuiltinWindowPoSt = true + return c + }), + node.Override(new(*wdpost.WindowPoStScheduler), modules.WindowPostScheduler( + config.DefaultStorageMiner().Fees, + config.ProvingConfig{ + DisableBuiltinWindowPoSt: true, + DisableBuiltinWinningPoSt: false, + DisableWDPoStPreChecks: false, + }, + ))), + kit.ThroughRPC(), + kit.WithTaskTypes([]sealtasks.TaskType{sealtasks.TTGenerateWindowPoSt})) + + maddr, err := miner.ActorAddress(ctx) + require.NoError(t, err) + + di, err := client.StateMinerProvingDeadline(ctx, maddr, types.EmptyTSK) + require.NoError(t, err) + + bm := ens.InterconnectAll().BeginMining(2 * time.Millisecond)[0] + + di = di.NextNotElapsed() + + t.Log("Running one proving period") + waitUntil := di.Open + di.WPoStChallengeWindow*2 + wdpost.SubmitConfidence + client.WaitTillChain(ctx, kit.HeightAtLeast(waitUntil)) + + t.Log("Waiting for post message") + bm.Stop() + + var lastPending []*types.SignedMessage + for i := 0; i < 500; i++ { + lastPending, err = client.MpoolPending(ctx, types.EmptyTSK) + require.NoError(t, err) + + if len(lastPending) > 0 { + break + } + time.Sleep(40 * time.Millisecond) + } + + require.Greater(t, len(lastPending), 0) + + t.Log("post message landed") + + bm.MineBlocks(ctx, 2*time.Millisecond) + + waitUntil = di.Open + di.WPoStChallengeWindow*3 + t.Logf("End for head.Height > %d", waitUntil) + + ts := client.WaitTillChain(ctx, kit.HeightAtLeast(waitUntil)) + t.Logf("Now head.Height = %d", ts.Height()) + + p, err := client.StateMinerPower(ctx, maddr, types.EmptyTSK) + require.NoError(t, err) + + ssz, err := miner.ActorSectorSize(ctx, maddr) + require.NoError(t, err) + + require.Equal(t, p.MinerPower, p.TotalPower) + require.Equal(t, p.MinerPower.RawBytePower, types.NewInt(uint64(ssz)*uint64(sectors))) + + mid, err := address.IDFromAddress(maddr) + require.NoError(t, err) + + di, err = client.StateMinerProvingDeadline(ctx, maddr, types.EmptyTSK) + require.NoError(t, err) + + // Remove one sector in the next deadline (so it's skipped) + { + parts, err := client.StateMinerPartitions(ctx, maddr, di.Index+1, types.EmptyTSK) + require.NoError(t, err) + require.Greater(t, len(parts), 0) + + secs := parts[0].AllSectors + n, err := secs.Count() + require.NoError(t, err) + require.Equal(t, uint64(2), n) + + // Drop the sector + sid, err := secs.First() + require.NoError(t, err) + + t.Logf("Drop sector %d; dl %d part %d", sid, di.Index+1, 0) + + err = miner.BaseAPI.(*impl.StorageMinerAPI).IStorageMgr.Remove(ctx, storiface.SectorRef{ + ID: abi.SectorID{ + Miner: abi.ActorID(mid), + Number: abi.SectorNumber(sid), + }, + }) + require.NoError(t, err) + } + + waitUntil = di.Close + di.WPoStChallengeWindow + ts = client.WaitTillChain(ctx, kit.HeightAtLeast(waitUntil)) + t.Logf("Now head.Height = %d", ts.Height()) + + p, err = client.StateMinerPower(ctx, maddr, types.EmptyTSK) + require.NoError(t, err) + + require.Equal(t, p.MinerPower, p.TotalPower) + require.Equal(t, p.MinerPower.RawBytePower, types.NewInt(uint64(ssz)*uint64(sectors-1))) +} diff --git a/node/builder_miner.go b/node/builder_miner.go index 8c78091dc54..b72b237618b 100644 --- a/node/builder_miner.go +++ b/node/builder_miner.go @@ -117,7 +117,7 @@ func ConfigStorageMiner(c interface{}) Option { Override(new(*miner.Miner), modules.SetupBlockProducer), Override(new(gen.WinningPoStProver), storage.NewWinningPoStProver), Override(new(*storage.Miner), modules.StorageMiner(cfg.Fees)), - Override(new(*wdpost.WindowPoStScheduler), modules.WindowPostScheduler(cfg.Fees)), + Override(new(*wdpost.WindowPoStScheduler), modules.WindowPostScheduler(cfg.Fees, cfg.Proving)), Override(new(sectorblocks.SectorBuilder), From(new(*storage.Miner))), ), diff --git a/node/config/cfgdocgen/gen.go b/node/config/cfgdocgen/gen.go index 8d0efb65e6b..5133501523c 100644 --- a/node/config/cfgdocgen/gen.go +++ b/node/config/cfgdocgen/gen.go @@ -53,7 +53,7 @@ func run() error { continue } case stType: - if strings.HasPrefix(line, "// ") { + if strings.HasPrefix(line, "//") { cline := strings.TrimSpace(strings.TrimPrefix(line, "//")) currentComment = append(currentComment, cline) continue diff --git a/node/config/doc_gen.go b/node/config/doc_gen.go index 68d81f80273..8e35af782f5 100644 --- a/node/config/doc_gen.go +++ b/node/config/doc_gen.go @@ -34,7 +34,10 @@ var Doc = map[string][]DocField{ Name: "DisableMetadataLog", Type: "bool", - Comment: `Note that in case of metadata corruption it might be much harder to recover + Comment: `When set to true disables metadata log (.lotus/kvlog). This can save disk +space by reducing metadata redundancy. + +Note that in case of metadata corruption it might be much harder to recover your node if metadata log is disabled`, }, }, @@ -627,7 +630,64 @@ over the worker address if this flag is set.`, Name: "ParallelCheckLimit", Type: "int", - Comment: `Maximum number of sector checks to run in parallel. (0 = unlimited)`, + Comment: `Maximum number of sector checks to run in parallel. (0 = unlimited) + +WARNING: Setting this value too high may make the node crash by running out of stack +WARNING: Setting this value too low may make sector challenge reading much slower, resulting in failed PoSt due +to late submission. + +After changing this option, confirm that the new value works in your setup by invoking +'lotus-miner proving compute window-post 0'`, + }, + { + Name: "DisableBuiltinWindowPoSt", + Type: "bool", + + Comment: `Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present. + +WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need +to be recovered. Before enabling this option, make sure your PoSt workers work correctly. + +After changing this option, confirm that the new value works in your setup by invoking +'lotus-miner proving compute window-post 0'`, + }, + { + Name: "DisableBuiltinWinningPoSt", + Type: "bool", + + Comment: `Disable Winning PoSt computation on the lotus-miner process even if no winning PoSt workers are present. + +WARNING: If no WinningPoSt workers are connected, Winning PoSt WILL FAIL resulting in lost block rewards. +Before enabling this option, make sure your PoSt workers work correctly.`, + }, + { + Name: "DisableWDPoStPreChecks", + Type: "bool", + + Comment: `Disable WindowPoSt provable sector readability checks. + +In normal operation, when preparing to compute WindowPoSt, lotus-miner will perform a round of reading challenges +from all sectors to confirm that those sectors can be proven. Challenges read in this process are discarded, as +we're only interested in checking that sector data can be read. + +When using builtin proof computation (no PoSt workers, and DisableBuiltinWindowPoSt is set to false), this process +can save a lot of time and compute resources in the case that some sectors are not readable - this is caused by +the builtin logic not skipping snark computation when some sectors need to be skipped. + +When using PoSt workers, this process is mostly redundant, with PoSt workers challenges will be read once, and +if challenges for some sectors aren't readable, those sectors will just get skipped. + +Disabling sector pre-checks will slightly reduce IO load when proving sectors, possibly resulting in shorter +time to produce window PoSt. In setups with good IO capabilities the effect of this option on proving time should +be negligible. + +NOTE: It likely is a bad idea to disable sector pre-checks in setups with no PoSt workers. + +NOTE: Even when this option is enabled, recovering sectors will be checked before recovery declaration message is +sent to the chain + +After changing this option, confirm that the new value works in your setup by invoking +'lotus-miner proving compute window-post 0'`, }, }, "Pubsub": []DocField{ diff --git a/node/config/storage.go b/node/config/storage.go index 16f688075ee..99ca0df8708 100644 --- a/node/config/storage.go +++ b/node/config/storage.go @@ -67,6 +67,8 @@ func (c *StorageMiner) StorageManager() sealer.Config { Assigner: c.Storage.Assigner, - ParallelCheckLimit: c.Proving.ParallelCheckLimit, + ParallelCheckLimit: c.Proving.ParallelCheckLimit, + DisableBuiltinWindowPoSt: c.Proving.DisableBuiltinWindowPoSt, + DisableBuiltinWinningPoSt: c.Proving.DisableBuiltinWinningPoSt, } } diff --git a/node/config/types.go b/node/config/types.go index cd05c8ce5e7..f9272794226 100644 --- a/node/config/types.go +++ b/node/config/types.go @@ -221,9 +221,55 @@ type RetrievalPricingDefault struct { type ProvingConfig struct { // Maximum number of sector checks to run in parallel. (0 = unlimited) + // + // WARNING: Setting this value too high may make the node crash by running out of stack + // WARNING: Setting this value too low may make sector challenge reading much slower, resulting in failed PoSt due + // to late submission. + // + // After changing this option, confirm that the new value works in your setup by invoking + // 'lotus-miner proving compute window-post 0' ParallelCheckLimit int - // todo disable builtin post + // Disable Window PoSt computation on the lotus-miner process even if no window PoSt workers are present. + // + // WARNING: If no windowPoSt workers are connected, window PoSt WILL FAIL resulting in faulty sectors which will need + // to be recovered. Before enabling this option, make sure your PoSt workers work correctly. + // + // After changing this option, confirm that the new value works in your setup by invoking + // 'lotus-miner proving compute window-post 0' + DisableBuiltinWindowPoSt bool + + // Disable Winning PoSt computation on the lotus-miner process even if no winning PoSt workers are present. + // + // WARNING: If no WinningPoSt workers are connected, Winning PoSt WILL FAIL resulting in lost block rewards. + // Before enabling this option, make sure your PoSt workers work correctly. + DisableBuiltinWinningPoSt bool + + // Disable WindowPoSt provable sector readability checks. + // + // In normal operation, when preparing to compute WindowPoSt, lotus-miner will perform a round of reading challenges + // from all sectors to confirm that those sectors can be proven. Challenges read in this process are discarded, as + // we're only interested in checking that sector data can be read. + // + // When using builtin proof computation (no PoSt workers, and DisableBuiltinWindowPoSt is set to false), this process + // can save a lot of time and compute resources in the case that some sectors are not readable - this is caused by + // the builtin logic not skipping snark computation when some sectors need to be skipped. + // + // When using PoSt workers, this process is mostly redundant, with PoSt workers challenges will be read once, and + // if challenges for some sectors aren't readable, those sectors will just get skipped. + // + // Disabling sector pre-checks will slightly reduce IO load when proving sectors, possibly resulting in shorter + // time to produce window PoSt. In setups with good IO capabilities the effect of this option on proving time should + // be negligible. + // + // NOTE: It likely is a bad idea to disable sector pre-checks in setups with no PoSt workers. + // + // NOTE: Even when this option is enabled, recovering sectors will be checked before recovery declaration message is + // sent to the chain + // + // After changing this option, confirm that the new value works in your setup by invoking + // 'lotus-miner proving compute window-post 0' + DisableWDPoStPreChecks bool } type SealingConfig struct { diff --git a/node/modules/storageminer.go b/node/modules/storageminer.go index 3e538247771..94d36547454 100644 --- a/node/modules/storageminer.go +++ b/node/modules/storageminer.go @@ -254,7 +254,7 @@ func StorageMiner(fc config.MinerFeeConfig) func(params StorageMinerParams) (*st } } -func WindowPostScheduler(fc config.MinerFeeConfig) func(params StorageMinerParams) (*wdpost.WindowPoStScheduler, error) { +func WindowPostScheduler(fc config.MinerFeeConfig, pc config.ProvingConfig) func(params StorageMinerParams) (*wdpost.WindowPoStScheduler, error) { return func(params StorageMinerParams) (*wdpost.WindowPoStScheduler, error) { var ( mctx = params.MetricsCtx @@ -269,7 +269,7 @@ func WindowPostScheduler(fc config.MinerFeeConfig) func(params StorageMinerParam ctx := helpers.LifecycleCtx(mctx, lc) - fps, err := wdpost.NewWindowedPoStScheduler(api, fc, as, sealer, verif, sealer, j, maddr) + fps, err := wdpost.NewWindowedPoStScheduler(api, fc, pc, as, sealer, verif, sealer, j, maddr) if err != nil { return nil, err } diff --git a/storage/sealer/manager.go b/storage/sealer/manager.go index 47112cbc0f3..3e5e6032bb6 100644 --- a/storage/sealer/manager.go +++ b/storage/sealer/manager.go @@ -69,8 +69,10 @@ type Manager struct { workLk sync.Mutex work *statestore.StateStore - parallelCheckLimit int - disallowRemoteFinalize bool + parallelCheckLimit int + disableBuiltinWindowPoSt bool + disableBuiltinWinningPoSt bool + disallowRemoteFinalize bool callToWork map[storiface.CallID]WorkID // used when we get an early return and there's no callToWork mapping @@ -120,7 +122,9 @@ type Config struct { ResourceFiltering ResourceFilteringStrategy // PoSt config - ParallelCheckLimit int + ParallelCheckLimit int + DisableBuiltinWindowPoSt bool + DisableBuiltinWinningPoSt bool DisallowRemoteFinalize bool @@ -156,8 +160,10 @@ func New(ctx context.Context, lstor *paths.Local, stor paths.Store, ls paths.Loc localProver: prover, - parallelCheckLimit: sc.ParallelCheckLimit, - disallowRemoteFinalize: sc.DisallowRemoteFinalize, + parallelCheckLimit: sc.ParallelCheckLimit, + disableBuiltinWindowPoSt: sc.DisableBuiltinWindowPoSt, + disableBuiltinWinningPoSt: sc.DisableBuiltinWinningPoSt, + disallowRemoteFinalize: sc.DisallowRemoteFinalize, work: mss, callToWork: map[storiface.CallID]WorkID{}, diff --git a/storage/sealer/manager_post.go b/storage/sealer/manager_post.go index e88807a29f0..a4b812f8fb6 100644 --- a/storage/sealer/manager_post.go +++ b/storage/sealer/manager_post.go @@ -17,7 +17,9 @@ import ( ) func (m *Manager) GenerateWinningPoSt(ctx context.Context, minerID abi.ActorID, sectorInfo []proof.ExtendedSectorInfo, randomness abi.PoStRandomness) ([]proof.PoStProof, error) { - if !m.winningPoStSched.CanSched(ctx) { + if !m.disableBuiltinWinningPoSt && !m.winningPoStSched.CanSched(ctx) { + // if builtin PoSt isn't disabled, and there are no workers, compute the PoSt locally + log.Info("GenerateWinningPoSt run at lotus-miner") return m.localProver.GenerateWinningPoSt(ctx, minerID, sectorInfo, randomness) } @@ -76,7 +78,9 @@ func (m *Manager) generateWinningPoSt(ctx context.Context, minerID abi.ActorID, } func (m *Manager) GenerateWindowPoSt(ctx context.Context, minerID abi.ActorID, sectorInfo []proof.ExtendedSectorInfo, randomness abi.PoStRandomness) (proof []proof.PoStProof, skipped []abi.SectorID, err error) { - if !m.windowPoStSched.CanSched(ctx) { + if !m.disableBuiltinWindowPoSt && !m.windowPoStSched.CanSched(ctx) { + // if builtin PoSt isn't disabled, and there are no workers, compute the PoSt locally + log.Info("GenerateWindowPoSt run at lotus-miner") return m.localProver.GenerateWindowPoSt(ctx, minerID, sectorInfo, randomness) } @@ -230,11 +234,9 @@ func (m *Manager) generatePartitionWindowPost(ctx context.Context, spt abi.Regis } func (m *Manager) GenerateWinningPoStWithVanilla(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, proofs [][]byte) ([]proof.PoStProof, error) { - //TODO implement me - panic("implement me") + panic("worker-level api shouldn't be called at this level") } func (m *Manager) GenerateWindowPoStWithVanilla(ctx context.Context, proofType abi.RegisteredPoStProof, minerID abi.ActorID, randomness abi.PoStRandomness, proofs [][]byte, partitionIdx int) (proof.PoStProof, error) { - //TODO implement me - panic("implement me") + panic("worker-level api shouldn't be called at this level") } diff --git a/storage/wdpost/wdpost_run.go b/storage/wdpost/wdpost_run.go index d77181f4bbd..85464ce2ddf 100644 --- a/storage/wdpost/wdpost_run.go +++ b/storage/wdpost/wdpost_run.go @@ -241,275 +241,6 @@ func (s *WindowPoStScheduler) checkSectors(ctx context.Context, check bitfield.B return sbf, nil } -// declareRecoveries identifies sectors that were previously marked as faulty -// for our miner, but are now recovered (i.e. are now provable again) and -// still not reported as such. -// -// It then reports the recovery on chain via a `DeclareFaultsRecovered` -// message to our miner actor. -// -// This is always invoked ahead of time, before the deadline for the evaluated -// sectors arrives. That way, recoveries are declared in preparation for those -// sectors to be proven. -// -// If a declaration is made, it awaits for build.MessageConfidence confirmations -// on chain before returning. -// -// TODO: the waiting should happen in the background. Right now this -// is blocking/delaying the actual generation and submission of WindowPoSts in -// this deadline! -func (s *WindowPoStScheduler) declareRecoveries(ctx context.Context, dlIdx uint64, partitions []api.Partition, tsk types.TipSetKey) ([]miner.RecoveryDeclaration, *types.SignedMessage, error) { - ctx, span := trace.StartSpan(ctx, "storage.declareRecoveries") - defer span.End() - - faulty := uint64(0) - params := &miner.DeclareFaultsRecoveredParams{ - Recoveries: []miner.RecoveryDeclaration{}, - } - - for partIdx, partition := range partitions { - unrecovered, err := bitfield.SubtractBitField(partition.FaultySectors, partition.RecoveringSectors) - if err != nil { - return nil, nil, xerrors.Errorf("subtracting recovered set from fault set: %w", err) - } - - uc, err := unrecovered.Count() - if err != nil { - return nil, nil, xerrors.Errorf("counting unrecovered sectors: %w", err) - } - - if uc == 0 { - continue - } - - faulty += uc - - recovered, err := s.checkSectors(ctx, unrecovered, tsk) - if err != nil { - return nil, nil, xerrors.Errorf("checking unrecovered sectors: %w", err) - } - - // if all sectors failed to recover, don't declare recoveries - recoveredCount, err := recovered.Count() - if err != nil { - return nil, nil, xerrors.Errorf("counting recovered sectors: %w", err) - } - - if recoveredCount == 0 { - continue - } - - params.Recoveries = append(params.Recoveries, miner.RecoveryDeclaration{ - Deadline: dlIdx, - Partition: uint64(partIdx), - Sectors: recovered, - }) - } - - recoveries := params.Recoveries - if len(recoveries) == 0 { - if faulty != 0 { - log.Warnw("No recoveries to declare", "deadline", dlIdx, "faulty", faulty) - } - - return recoveries, nil, nil - } - - enc, aerr := actors.SerializeParams(params) - if aerr != nil { - return recoveries, nil, xerrors.Errorf("could not serialize declare recoveries parameters: %w", aerr) - } - - msg := &types.Message{ - To: s.actor, - Method: builtin.MethodsMiner.DeclareFaultsRecovered, - Params: enc, - Value: types.NewInt(0), - } - spec := &api.MessageSendSpec{MaxFee: abi.TokenAmount(s.feeCfg.MaxWindowPoStGasFee)} - if err := s.prepareMessage(ctx, msg, spec); err != nil { - return recoveries, nil, err - } - - sm, err := s.api.MpoolPushMessage(ctx, msg, &api.MessageSendSpec{MaxFee: abi.TokenAmount(s.feeCfg.MaxWindowPoStGasFee)}) - if err != nil { - return recoveries, sm, xerrors.Errorf("pushing message to mpool: %w", err) - } - - log.Warnw("declare faults recovered Message CID", "cid", sm.Cid()) - - rec, err := s.api.StateWaitMsg(context.TODO(), sm.Cid(), build.MessageConfidence, api.LookbackNoLimit, true) - if err != nil { - return recoveries, sm, xerrors.Errorf("declare faults recovered wait error: %w", err) - } - - if rec.Receipt.ExitCode != 0 { - return recoveries, sm, xerrors.Errorf("declare faults recovered wait non-0 exit code: %d", rec.Receipt.ExitCode) - } - - return recoveries, sm, nil -} - -// declareFaults identifies the sectors on the specified proving deadline that -// are faulty, and reports the faults on chain via the `DeclareFaults` message -// to our miner actor. -// -// This is always invoked ahead of time, before the deadline for the evaluated -// sectors arrives. That way, faults are declared before a penalty is accrued. -// -// If a declaration is made, it awaits for build.MessageConfidence confirmations -// on chain before returning. -// -// TODO: the waiting should happen in the background. Right now this -// is blocking/delaying the actual generation and submission of WindowPoSts in -// this deadline! -func (s *WindowPoStScheduler) declareFaults(ctx context.Context, dlIdx uint64, partitions []api.Partition, tsk types.TipSetKey) ([]miner.FaultDeclaration, *types.SignedMessage, error) { - ctx, span := trace.StartSpan(ctx, "storage.declareFaults") - defer span.End() - - bad := uint64(0) - params := &miner.DeclareFaultsParams{ - Faults: []miner.FaultDeclaration{}, - } - - for partIdx, partition := range partitions { - nonFaulty, err := bitfield.SubtractBitField(partition.LiveSectors, partition.FaultySectors) - if err != nil { - return nil, nil, xerrors.Errorf("determining non faulty sectors: %w", err) - } - - good, err := s.checkSectors(ctx, nonFaulty, tsk) - if err != nil { - return nil, nil, xerrors.Errorf("checking sectors: %w", err) - } - - newFaulty, err := bitfield.SubtractBitField(nonFaulty, good) - if err != nil { - return nil, nil, xerrors.Errorf("calculating faulty sector set: %w", err) - } - - c, err := newFaulty.Count() - if err != nil { - return nil, nil, xerrors.Errorf("counting faulty sectors: %w", err) - } - - if c == 0 { - continue - } - - bad += c - - params.Faults = append(params.Faults, miner.FaultDeclaration{ - Deadline: dlIdx, - Partition: uint64(partIdx), - Sectors: newFaulty, - }) - } - - faults := params.Faults - if len(faults) == 0 { - return faults, nil, nil - } - - log.Errorw("DETECTED FAULTY SECTORS, declaring faults", "count", bad) - - enc, aerr := actors.SerializeParams(params) - if aerr != nil { - return faults, nil, xerrors.Errorf("could not serialize declare faults parameters: %w", aerr) - } - - msg := &types.Message{ - To: s.actor, - Method: builtin.MethodsMiner.DeclareFaults, - Params: enc, - Value: types.NewInt(0), // TODO: Is there a fee? - } - spec := &api.MessageSendSpec{MaxFee: abi.TokenAmount(s.feeCfg.MaxWindowPoStGasFee)} - if err := s.prepareMessage(ctx, msg, spec); err != nil { - return faults, nil, err - } - - sm, err := s.api.MpoolPushMessage(ctx, msg, spec) - if err != nil { - return faults, sm, xerrors.Errorf("pushing message to mpool: %w", err) - } - - log.Warnw("declare faults Message CID", "cid", sm.Cid()) - - rec, err := s.api.StateWaitMsg(context.TODO(), sm.Cid(), build.MessageConfidence, api.LookbackNoLimit, true) - if err != nil { - return faults, sm, xerrors.Errorf("declare faults wait error: %w", err) - } - - if rec.Receipt.ExitCode != 0 { - return faults, sm, xerrors.Errorf("declare faults wait non-0 exit code: %d", rec.Receipt.ExitCode) - } - - return faults, sm, nil -} - -func (s *WindowPoStScheduler) asyncFaultRecover(di dline.Info, ts *types.TipSet) { - go func() { - // check faults / recoveries for the *next* deadline. It's already too - // late to declare them for this deadline - declDeadline := (di.Index + 2) % di.WPoStPeriodDeadlines - - partitions, err := s.api.StateMinerPartitions(context.TODO(), s.actor, declDeadline, ts.Key()) - if err != nil { - log.Errorf("getting partitions: %v", err) - return - } - - var ( - sigmsg *types.SignedMessage - recoveries []miner.RecoveryDeclaration - faults []miner.FaultDeclaration - - // optionalCid returns the CID of the message, or cid.Undef is the - // message is nil. We don't need the argument (could capture the - // pointer), but it's clearer and purer like that. - optionalCid = func(sigmsg *types.SignedMessage) cid.Cid { - if sigmsg == nil { - return cid.Undef - } - return sigmsg.Cid() - } - ) - - if recoveries, sigmsg, err = s.declareRecoveries(context.TODO(), declDeadline, partitions, ts.Key()); err != nil { - // TODO: This is potentially quite bad, but not even trying to post when this fails is objectively worse - log.Errorf("checking sector recoveries: %v", err) - } - - s.journal.RecordEvent(s.evtTypes[evtTypeWdPoStRecoveries], func() interface{} { - j := WdPoStRecoveriesProcessedEvt{ - evtCommon: s.getEvtCommon(err), - Declarations: recoveries, - MessageCID: optionalCid(sigmsg), - } - j.Error = err - return j - }) - - if ts.Height() > build.UpgradeIgnitionHeight { - return // FORK: declaring faults after ignition upgrade makes no sense - } - - if faults, sigmsg, err = s.declareFaults(context.TODO(), declDeadline, partitions, ts.Key()); err != nil { - // TODO: This is also potentially really bad, but we try to post anyways - log.Errorf("checking sector faults: %v", err) - } - - s.journal.RecordEvent(s.evtTypes[evtTypeWdPoStFaults], func() interface{} { - return WdPoStFaultsProcessedEvt{ - evtCommon: s.getEvtCommon(err), - Declarations: faults, - MessageCID: optionalCid(sigmsg), - } - }) - }() -} - // runPoStCycle runs a full cycle of the PoSt process: // // 1. performs recovery declarations for the next deadline. @@ -604,9 +335,15 @@ func (s *WindowPoStScheduler) runPoStCycle(ctx context.Context, manual bool, di return nil, xerrors.Errorf("adding recoveries to set of sectors to prove: %w", err) } - good, err := s.checkSectors(ctx, toProve, ts.Key()) + good, err := toProve.Copy() if err != nil { - return nil, xerrors.Errorf("checking sectors to skip: %w", err) + return nil, xerrors.Errorf("copy toProve: %w", err) + } + if !s.disablePreChecks { + good, err = s.checkSectors(ctx, toProve, ts.Key()) + if err != nil { + return nil, xerrors.Errorf("checking sectors to skip: %w", err) + } } good, err = bitfield.SubtractBitField(good, postSkipped) diff --git a/storage/wdpost/wdpost_run_faults.go b/storage/wdpost/wdpost_run_faults.go new file mode 100644 index 00000000000..0eda986fbd0 --- /dev/null +++ b/storage/wdpost/wdpost_run_faults.go @@ -0,0 +1,291 @@ +package wdpost + +import ( + "context" + + "github.com/ipfs/go-cid" + "go.opencensus.io/trace" + "golang.org/x/xerrors" + + "github.com/filecoin-project/go-bitfield" + "github.com/filecoin-project/go-state-types/abi" + "github.com/filecoin-project/go-state-types/builtin" + "github.com/filecoin-project/go-state-types/builtin/v8/miner" + "github.com/filecoin-project/go-state-types/dline" + + "github.com/filecoin-project/lotus/api" + "github.com/filecoin-project/lotus/build" + "github.com/filecoin-project/lotus/chain/actors" + "github.com/filecoin-project/lotus/chain/types" +) + +// declareRecoveries identifies sectors that were previously marked as faulty +// for our miner, but are now recovered (i.e. are now provable again) and +// still not reported as such. +// +// It then reports the recovery on chain via a `DeclareFaultsRecovered` +// message to our miner actor. +// +// This is always invoked ahead of time, before the deadline for the evaluated +// sectors arrives. That way, recoveries are declared in preparation for those +// sectors to be proven. +// +// If a declaration is made, it awaits for build.MessageConfidence confirmations +// on chain before returning. +// +// TODO: the waiting should happen in the background. Right now this +// is blocking/delaying the actual generation and submission of WindowPoSts in +// this deadline! +func (s *WindowPoStScheduler) declareRecoveries(ctx context.Context, dlIdx uint64, partitions []api.Partition, tsk types.TipSetKey) ([]miner.RecoveryDeclaration, *types.SignedMessage, error) { + ctx, span := trace.StartSpan(ctx, "storage.declareRecoveries") + defer span.End() + + faulty := uint64(0) + params := &miner.DeclareFaultsRecoveredParams{ + Recoveries: []miner.RecoveryDeclaration{}, + } + + for partIdx, partition := range partitions { + unrecovered, err := bitfield.SubtractBitField(partition.FaultySectors, partition.RecoveringSectors) + if err != nil { + return nil, nil, xerrors.Errorf("subtracting recovered set from fault set: %w", err) + } + + uc, err := unrecovered.Count() + if err != nil { + return nil, nil, xerrors.Errorf("counting unrecovered sectors: %w", err) + } + + if uc == 0 { + continue + } + + faulty += uc + + recovered, err := s.checkSectors(ctx, unrecovered, tsk) + if err != nil { + return nil, nil, xerrors.Errorf("checking unrecovered sectors: %w", err) + } + + // if all sectors failed to recover, don't declare recoveries + recoveredCount, err := recovered.Count() + if err != nil { + return nil, nil, xerrors.Errorf("counting recovered sectors: %w", err) + } + + if recoveredCount == 0 { + continue + } + + params.Recoveries = append(params.Recoveries, miner.RecoveryDeclaration{ + Deadline: dlIdx, + Partition: uint64(partIdx), + Sectors: recovered, + }) + } + + recoveries := params.Recoveries + if len(recoveries) == 0 { + if faulty != 0 { + log.Warnw("No recoveries to declare", "deadline", dlIdx, "faulty", faulty) + } + + return recoveries, nil, nil + } + + enc, aerr := actors.SerializeParams(params) + if aerr != nil { + return recoveries, nil, xerrors.Errorf("could not serialize declare recoveries parameters: %w", aerr) + } + + msg := &types.Message{ + To: s.actor, + Method: builtin.MethodsMiner.DeclareFaultsRecovered, + Params: enc, + Value: types.NewInt(0), + } + spec := &api.MessageSendSpec{MaxFee: abi.TokenAmount(s.feeCfg.MaxWindowPoStGasFee)} + if err := s.prepareMessage(ctx, msg, spec); err != nil { + return recoveries, nil, err + } + + sm, err := s.api.MpoolPushMessage(ctx, msg, &api.MessageSendSpec{MaxFee: abi.TokenAmount(s.feeCfg.MaxWindowPoStGasFee)}) + if err != nil { + return recoveries, sm, xerrors.Errorf("pushing message to mpool: %w", err) + } + + log.Warnw("declare faults recovered Message CID", "cid", sm.Cid()) + + rec, err := s.api.StateWaitMsg(context.TODO(), sm.Cid(), build.MessageConfidence, api.LookbackNoLimit, true) + if err != nil { + return recoveries, sm, xerrors.Errorf("declare faults recovered wait error: %w", err) + } + + if rec.Receipt.ExitCode != 0 { + return recoveries, sm, xerrors.Errorf("declare faults recovered wait non-0 exit code: %d", rec.Receipt.ExitCode) + } + + return recoveries, sm, nil +} + +// declareFaults identifies the sectors on the specified proving deadline that +// are faulty, and reports the faults on chain via the `DeclareFaults` message +// to our miner actor. +// +// NOTE: THIS CODE ISN'T INVOKED AFTER THE IGNITION UPGRADE +// +// This is always invoked ahead of time, before the deadline for the evaluated +// sectors arrives. That way, faults are declared before a penalty is accrued. +// +// If a declaration is made, it awaits for build.MessageConfidence confirmations +// on chain before returning. +// +// TODO: the waiting should happen in the background. Right now this +// is blocking/delaying the actual generation and submission of WindowPoSts in +// this deadline! +func (s *WindowPoStScheduler) declareFaults(ctx context.Context, dlIdx uint64, partitions []api.Partition, tsk types.TipSetKey) ([]miner.FaultDeclaration, *types.SignedMessage, error) { + ctx, span := trace.StartSpan(ctx, "storage.declareFaults") + defer span.End() + + bad := uint64(0) + params := &miner.DeclareFaultsParams{ + Faults: []miner.FaultDeclaration{}, + } + + for partIdx, partition := range partitions { + nonFaulty, err := bitfield.SubtractBitField(partition.LiveSectors, partition.FaultySectors) + if err != nil { + return nil, nil, xerrors.Errorf("determining non faulty sectors: %w", err) + } + + good, err := s.checkSectors(ctx, nonFaulty, tsk) + if err != nil { + return nil, nil, xerrors.Errorf("checking sectors: %w", err) + } + + newFaulty, err := bitfield.SubtractBitField(nonFaulty, good) + if err != nil { + return nil, nil, xerrors.Errorf("calculating faulty sector set: %w", err) + } + + c, err := newFaulty.Count() + if err != nil { + return nil, nil, xerrors.Errorf("counting faulty sectors: %w", err) + } + + if c == 0 { + continue + } + + bad += c + + params.Faults = append(params.Faults, miner.FaultDeclaration{ + Deadline: dlIdx, + Partition: uint64(partIdx), + Sectors: newFaulty, + }) + } + + faults := params.Faults + if len(faults) == 0 { + return faults, nil, nil + } + + log.Errorw("DETECTED FAULTY SECTORS, declaring faults", "count", bad) + + enc, aerr := actors.SerializeParams(params) + if aerr != nil { + return faults, nil, xerrors.Errorf("could not serialize declare faults parameters: %w", aerr) + } + + msg := &types.Message{ + To: s.actor, + Method: builtin.MethodsMiner.DeclareFaults, + Params: enc, + Value: types.NewInt(0), // TODO: Is there a fee? + } + spec := &api.MessageSendSpec{MaxFee: abi.TokenAmount(s.feeCfg.MaxWindowPoStGasFee)} + if err := s.prepareMessage(ctx, msg, spec); err != nil { + return faults, nil, err + } + + sm, err := s.api.MpoolPushMessage(ctx, msg, spec) + if err != nil { + return faults, sm, xerrors.Errorf("pushing message to mpool: %w", err) + } + + log.Warnw("declare faults Message CID", "cid", sm.Cid()) + + rec, err := s.api.StateWaitMsg(context.TODO(), sm.Cid(), build.MessageConfidence, api.LookbackNoLimit, true) + if err != nil { + return faults, sm, xerrors.Errorf("declare faults wait error: %w", err) + } + + if rec.Receipt.ExitCode != 0 { + return faults, sm, xerrors.Errorf("declare faults wait non-0 exit code: %d", rec.Receipt.ExitCode) + } + + return faults, sm, nil +} + +func (s *WindowPoStScheduler) asyncFaultRecover(di dline.Info, ts *types.TipSet) { + go func() { + // check faults / recoveries for the *next* deadline. It's already too + // late to declare them for this deadline + declDeadline := (di.Index + 2) % di.WPoStPeriodDeadlines + + partitions, err := s.api.StateMinerPartitions(context.TODO(), s.actor, declDeadline, ts.Key()) + if err != nil { + log.Errorf("getting partitions: %v", err) + return + } + + var ( + sigmsg *types.SignedMessage + recoveries []miner.RecoveryDeclaration + faults []miner.FaultDeclaration + + // optionalCid returns the CID of the message, or cid.Undef is the + // message is nil. We don't need the argument (could capture the + // pointer), but it's clearer and purer like that. + optionalCid = func(sigmsg *types.SignedMessage) cid.Cid { + if sigmsg == nil { + return cid.Undef + } + return sigmsg.Cid() + } + ) + + if recoveries, sigmsg, err = s.declareRecoveries(context.TODO(), declDeadline, partitions, ts.Key()); err != nil { + // TODO: This is potentially quite bad, but not even trying to post when this fails is objectively worse + log.Errorf("checking sector recoveries: %v", err) + } + + s.journal.RecordEvent(s.evtTypes[evtTypeWdPoStRecoveries], func() interface{} { + j := WdPoStRecoveriesProcessedEvt{ + evtCommon: s.getEvtCommon(err), + Declarations: recoveries, + MessageCID: optionalCid(sigmsg), + } + j.Error = err + return j + }) + + if ts.Height() > build.UpgradeIgnitionHeight { + return // FORK: declaring faults after ignition upgrade makes no sense + } + + if faults, sigmsg, err = s.declareFaults(context.TODO(), declDeadline, partitions, ts.Key()); err != nil { + // TODO: This is also potentially really bad, but we try to post anyways + log.Errorf("checking sector faults: %v", err) + } + + s.journal.RecordEvent(s.evtTypes[evtTypeWdPoStFaults], func() interface{} { + return WdPoStFaultsProcessedEvt{ + evtCommon: s.getEvtCommon(err), + Declarations: faults, + MessageCID: optionalCid(sigmsg), + } + }) + }() +} diff --git a/storage/wdpost/wdpost_sched.go b/storage/wdpost/wdpost_sched.go index 6ac77d9ba91..66e8f9f1866 100644 --- a/storage/wdpost/wdpost_sched.go +++ b/storage/wdpost/wdpost_sched.go @@ -70,6 +70,7 @@ type WindowPoStScheduler struct { faultTracker sealer.FaultTracker proofType abi.RegisteredPoStProof partitionSectors uint64 + disablePreChecks bool ch *changeHandler actor address.Address @@ -84,6 +85,7 @@ type WindowPoStScheduler struct { // NewWindowedPoStScheduler creates a new WindowPoStScheduler scheduler. func NewWindowedPoStScheduler(api NodeAPI, cfg config.MinerFeeConfig, + pcfg config.ProvingConfig, as *ctladdr.AddressSelector, sp storiface.ProverPoSt, verif storiface.Verifier, @@ -104,6 +106,7 @@ func NewWindowedPoStScheduler(api NodeAPI, faultTracker: ft, proofType: mi.WindowPoStProofType, partitionSectors: mi.WindowPoStPartitionSectors, + disablePreChecks: pcfg.DisableWDPoStPreChecks, actor: actor, evtTypes: [...]journal.EventType{