Skip to content

Commit

Permalink
tortoise: tracer to debug execution interactively (#4447)
Browse files Browse the repository at this point in the history
tortoise provides EnableTracer option, if enabled it will dump to stderr all inputs and outputs from tortoise consensus using zap library.

this logs will be available in cloud and can be downloaded either with grafana or logcli. if downloaded from grafana, next post processing script is useful:
> jq -c '.[].line | fromjson | {"t": .t, "o": .o}' ./EXAMPLE.json

note that traces are verbose and should be decreased in size, as an example i added tortoise/data/failed_nodes.json from one of the nodes.

once trace is collected it can be used as a regression test, and also for interactive debugging. for interactive debugging:
1. install delve https://github.com/go-delve/delve/tree/master/Documentation/installation
2. run delve with selected trace `dlv exec ./trace -- -breakpoint -level=debug ./tortoise/data/failed_nodes.json`
3. continue will stop after every step, debug log will also print what happened during that step
```
> (dlv) continue
2023-05-30T11:42:03.864+0200	DEBUG	trace	using trace	{"path": "./tortoise/data/failed_nodes.json", "name": ""}
> [hardcoded-breakpoint] github.com/spacemeshos/go-spacemesh/tortoise.RunTrace() ./tortoise/tracer.go:101 (hits total:0) (PC: 0xdf038f)
Warning: debugging optimized function
    96:				return err
    97:			}
    98:			if err := ev.Run(runner); err != nil {
    99:				return err
   100:			}
=> 101:			if breakpoint != nil {
   102:				breakpoint()
   103:			}
   104:		}
   105:	}
   106:	
```
  • Loading branch information
dshulyak committed Jun 1, 2023
1 parent 6325458 commit bb80c6e
Show file tree
Hide file tree
Showing 8 changed files with 711 additions and 19 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ temp.js
/go-hare
/cp.out
/cover-all.out
/trace
cover.out
coverage.html
gcloud.json
Expand Down
10 changes: 8 additions & 2 deletions cmd/node/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -602,10 +602,16 @@ func (app *App) initServices(ctx context.Context, poetClients []activation.PoetP
if trtlCfg.BadBeaconVoteDelayLayers == 0 {
trtlCfg.BadBeaconVoteDelayLayers = app.Config.LayersPerEpoch
}
trtl, err := tortoise.Recover(app.cachedDB, beaconProtocol,
tortoise.WithContext(ctx),
trtlopts := []tortoise.Opt{
tortoise.WithLogger(app.addLogger(TrtlLogger, lg)),
tortoise.WithConfig(trtlCfg),
}
if trtlCfg.EnableTracer {
app.log.With().Info("tortoise will trace execution")
trtlopts = append(trtlopts, tortoise.WithTracer())
}
trtl, err := tortoise.Recover(
app.cachedDB, beaconProtocol, trtlopts...,
)
if err != nil {
return fmt.Errorf("can't recover tortoise state: %w", err)
Expand Down
4 changes: 2 additions & 2 deletions systest/parameters/fastnet/smesher.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
"cycle-gap": "30s",
"grace-period": "10s"
},
"logging": {
"trtl": "debug"
"tortoise": {
"tortoise-enable-tracer": true
},
"api": {
"grpc-public-services": [
Expand Down
109 changes: 97 additions & 12 deletions tortoise/algorithm.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ type Config struct {
MaxExceptions int `mapstructure:"tortoise-max-exceptions"` // if candidate for base ballot has more than max exceptions it will be ignored
// number of layers to delay votes for blocks with bad beacon values during self-healing. ideally a full epoch.
BadBeaconVoteDelayLayers uint32 `mapstructure:"tortoise-delay-layers"`
// EnableTracer will write tortoise traces to the stderr.
EnableTracer bool `mapstructure:"tortoise-enable-tracer"`

LayerSize uint32
}
Expand All @@ -42,8 +44,9 @@ type Tortoise struct {
ctx context.Context
cfg Config

mu sync.Mutex
trtl *turtle
mu sync.Mutex
trtl *turtle
tracer *tracer
}

// Opt for configuring tortoise.
Expand All @@ -56,17 +59,17 @@ func WithLogger(logger log.Log) Opt {
}
}

// WithContext defines context for tortoise.
func WithContext(ctx context.Context) Opt {
// WithConfig defines protocol parameters.
func WithConfig(cfg Config) Opt {
return func(t *Tortoise) {
t.ctx = ctx
t.cfg = cfg
}
}

// WithConfig defines protocol parameters.
func WithConfig(cfg Config) Opt {
// WithTracer enables tracing of every call to the tortoise.
func WithTracer(opts ...TraceOpt) Opt {
return func(t *Tortoise) {
t.cfg = cfg
t.tracer = newTracer(opts...)
}
}

Expand All @@ -87,6 +90,17 @@ func New(opts ...Opt) (*Tortoise, error) {
)
}
t.trtl = newTurtle(t.logger, t.cfg)
if t.tracer != nil {
t.tracer.On(&ConfigTrace{
Hdist: t.cfg.Hdist,
Zdist: t.cfg.Zdist,
WindowSize: t.cfg.WindowSize,
MaxExceptions: uint32(t.cfg.MaxExceptions),
BadBeaconVoteDelayLayers: t.cfg.BadBeaconVoteDelayLayers,
LayerSize: t.cfg.LayerSize,
EpochSize: types.GetLayersPerEpoch(),
})
}
return t, nil
}

Expand Down Expand Up @@ -114,6 +128,9 @@ func (t *Tortoise) OnWeakCoin(lid types.LayerID, coin bool) {
} else {
layer.coinflip = against
}
if t.tracer != nil {
t.tracer.On(&WeakCoinTrace{Layer: lid, Coin: coin})
}
}

func (t *Tortoise) OnBeacon(eid types.EpochID, beacon types.Beacon) {
Expand All @@ -130,6 +147,9 @@ func (t *Tortoise) OnBeacon(eid types.EpochID, beacon types.Beacon) {
}
epoch := t.trtl.epoch(eid)
epoch.beacon = &beacon
if t.tracer != nil {
t.tracer.On(&BeaconTrace{Epoch: eid, Beacon: beacon})
}
}

type encodeConf struct {
Expand Down Expand Up @@ -167,6 +187,20 @@ func (t *Tortoise) EncodeVotes(ctx context.Context, opts ...EncodeVotesOpts) (*t
if err != nil {
errorsCounter.Inc()
}
if t.tracer != nil {
event := &EncodeVotesTrace{
Opinion: opinion,
}
if err != nil {
event.Error = err.Error()
}
if conf.current != nil {
event.Layer = *conf.current
} else {
event.Layer = t.trtl.last + 1
}
t.tracer.On(event)
}
return opinion, err
}

Expand All @@ -179,15 +213,21 @@ func (t *Tortoise) TallyVotes(ctx context.Context, lid types.LayerID) {
start = time.Now()
t.trtl.onLayer(ctx, lid)
executeTallyVotes.Observe(float64(time.Since(start).Nanoseconds()))
if t.tracer != nil {
t.tracer.On(&TallyTrace{Layer: lid})
}
}

// OnAtx is expected to be called before ballots that use this atx.
func (t *Tortoise) OnAtx(atx *types.ActivationTxHeader) {
func (t *Tortoise) OnAtx(header *types.ActivationTxHeader) {
start := time.Now()
t.mu.Lock()
defer t.mu.Unlock()
waitAtxDuration.Observe(float64(time.Since(start).Nanoseconds()))
t.trtl.onAtx(atx)
t.trtl.onAtx(header)
if t.tracer != nil {
t.tracer.On(&AtxTrace{Header: header})
}
}

// OnBlock updates tortoise with information that data is available locally.
Expand All @@ -197,6 +237,9 @@ func (t *Tortoise) OnBlock(header types.BlockHeader) {
defer t.mu.Unlock()
waitBlockDuration.Observe(float64(time.Since(start).Nanoseconds()))
t.trtl.onBlock(header, true, false)
if t.tracer != nil {
t.tracer.On(&BlockTrace{Header: header})
}
}

// OnValidBlock inserts block, updates that data is stored locally
Expand All @@ -207,6 +250,9 @@ func (t *Tortoise) OnValidBlock(header types.BlockHeader) {
defer t.mu.Unlock()
waitBlockDuration.Observe(float64(time.Since(start).Nanoseconds()))
t.trtl.onBlock(header, true, true)
if t.tracer != nil {
t.tracer.On(&BlockTrace{Header: header, Valid: true})
}
}

// OnBallot should be called every time new ballot is received.
Expand All @@ -215,10 +261,14 @@ func (t *Tortoise) OnValidBlock(header types.BlockHeader) {
func (t *Tortoise) OnBallot(ballot *types.Ballot) {
t.mu.Lock()
defer t.mu.Unlock()
if err := t.trtl.onBallot(ballot); err != nil {
err := t.trtl.onBallot(ballot)
if err != nil {
errorsCounter.Inc()
t.logger.With().Error("failed to save state from ballot", ballot.ID(), log.Err(err))
}
if t.tracer != nil {
t.tracer.On(&BallotTrace{Ballot: ballot, Malicious: ballot.IsMalicious()})
}
}

// DecodedBallot created after unwrapping exceptions list and computing internal opinion.
Expand All @@ -237,6 +287,18 @@ func (t *Tortoise) DecodeBallot(ballot *types.Ballot) (*DecodedBallot, error) {
t.mu.Lock()
defer t.mu.Unlock()
waitBallotDuration.Observe(float64(time.Since(start).Nanoseconds()))
decoded, err := t.decodeBallot(ballot)
if t.tracer != nil {
ev := &DecodeBallotTrace{Ballot: ballot, Malicious: ballot.IsMalicious()}
if err != nil {
ev.Error = err.Error()
}
t.tracer.On(ev)
}
return decoded, err
}

func (t *Tortoise) decodeBallot(ballot *types.Ballot) (*DecodedBallot, error) {
info, min, err := t.trtl.decodeBallot(ballot)
if err != nil {
errorsCounter.Inc()
Expand Down Expand Up @@ -266,6 +328,9 @@ func (t *Tortoise) StoreBallot(decoded *DecodedBallot) error {
decoded.info.malicious = true
}
t.trtl.storeBallot(decoded.info, decoded.minHint)
if t.tracer != nil {
t.tracer.On(&StoreBallotTrace{ID: decoded.ID(), Malicious: decoded.IsMalicious()})
}
return nil
}

Expand All @@ -281,6 +346,9 @@ func (t *Tortoise) OnHareOutput(lid types.LayerID, bid types.BlockID) {
defer t.mu.Unlock()
waitHareOutputDuration.Observe(float64(time.Since(start).Nanoseconds()))
t.trtl.onHareOutput(lid, bid)
if t.tracer != nil {
t.tracer.On(&HareTrace{Layer: lid, Vote: bid})
}
}

// GetMissingActiveSet returns unknown atxs from the original list. It is done for a specific epoch
Expand Down Expand Up @@ -318,14 +386,31 @@ func (t *Tortoise) Updates() []result.Layer {
)
}
t.trtl.pending = 0
if t.tracer != nil {
t.tracer.On(&UpdatesTrace{ResultsTrace{
From: t.trtl.pending, To: t.trtl.processed,
Results: rst,
}})
}
return rst
}

// Results returns layers that crossed threshold in range [from, to].
func (t *Tortoise) Results(from, to types.LayerID) ([]result.Layer, error) {
t.mu.Lock()
defer t.mu.Unlock()
return t.results(from, to)
rst, err := t.results(from, to)
if t.tracer != nil {
ev := &ResultsTrace{
From: from, To: to,
Results: rst,
}
if err != nil {
ev.Error = err.Error()
}
t.tracer.On(ev)
}
return rst, err
}

func (t *Tortoise) results(from, to types.LayerID) ([]result.Layer, error) {
Expand Down
25 changes: 22 additions & 3 deletions tortoise/tortoise_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"fmt"
"math/rand"
"os"
"path/filepath"
"testing"
"time"

Expand Down Expand Up @@ -868,7 +869,7 @@ func TestDecodeVotes(t *testing.T) {
hasher.WriteSupport(supported, 0)
ballot.OpinionHash = hasher.Hash()
ballot.Votes.Support = []types.Vote{{ID: supported, LayerID: ballot.Layer - 1}}
_, err = tortoise.DecodeBallot(&ballot)
_, err = tortoise.decodeBallot(&ballot)
require.NoError(t, err)
})
}
Expand Down Expand Up @@ -2779,7 +2780,7 @@ func TestEncodeVotes(t *testing.T) {
hasher.WriteSupport(block.ID(), block.TickHeight)
hasher.Sum(ballot.OpinionHash[:0])

decoded, err := tortoise.DecodeBallot(&ballot)
decoded, err := tortoise.decodeBallot(&ballot)
require.NoError(t, err)
require.NoError(t, tortoise.StoreBallot(decoded))

Expand Down Expand Up @@ -2858,7 +2859,7 @@ func TestBaseBallotBeforeCurrentLayer(t *testing.T) {
ballot.InnerBallot = ballots[0].InnerBallot
ballot.EligibilityProofs = ballots[0].EligibilityProofs
ballot.Votes.Base = ballots[1].ID()
_, err = tortoise.DecodeBallot(&ballot)
_, err = tortoise.decodeBallot(&ballot)
require.ErrorContains(t, err, "votes for ballot")
})
}
Expand Down Expand Up @@ -3053,3 +3054,21 @@ func TestUpdates(t *testing.T) {
require.Equal(t, id, updates[0].Blocks[0].Header.ID)
})
}

func TestData(t *testing.T) {
t.Skip()
t.Parallel()
data, err := filepath.Abs("./data")
require.NoError(t, err)

entries, err := os.ReadDir(data)
require.NoError(t, err)
for _, entry := range entries {
entry := entry
t.Run(entry.Name(), func(t *testing.T) {
t.Parallel()
require.NoError(t, RunTrace(filepath.Join(data, entry.Name()), nil,
WithLogger(logtest.New(t)), WithTracer(WithOutput("/tmp/trace"))))
})
}
}
31 changes: 31 additions & 0 deletions tortoise/trace/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package main

import (
"flag"
"runtime"

"go.uber.org/zap"
"go.uber.org/zap/zapcore"

"github.com/spacemeshos/go-spacemesh/log"
"github.com/spacemeshos/go-spacemesh/tortoise"
)

var (
level = zap.LevelFlag("level", zapcore.ErrorLevel, "set verbosity level for execution")
bpoint = flag.Bool("breakpoint", false, "enable breakpoint after every step")
)

func main() {
flag.Parse()
atom := zap.NewAtomicLevelAt(*level)
logger := log.NewWithLevel("trace", atom)
logger.With().Debug("using trace", log.String("path", flag.Arg(0)))
var breakpoint func()
if *bpoint {
breakpoint = runtime.Breakpoint
}
if err := tortoise.RunTrace(flag.Arg(0), breakpoint, tortoise.WithLogger(logger)); err != nil {
logger.With().Fatal("run trace failed", log.Err(err))
}
}
Loading

0 comments on commit bb80c6e

Please sign in to comment.