From ea4effce8442357ab01b7d5e940599808d0ac080 Mon Sep 17 00:00:00 2001 From: Gyuho Lee Date: Thu, 5 Apr 2018 11:45:44 -0700 Subject: [PATCH] functional-tester/tester: add randomize delay Signed-off-by: Gyuho Lee --- tools/functional-tester/tester/cluster.go | 43 +++++++++--- .../functional-tester/tester/cluster_test.go | 17 ++--- .../tester/cluster_tester.go | 1 + tools/functional-tester/tester/failure.go | 15 ++++- .../tester/failure_case_network_delay.go | 66 ++++++++++++++++--- .../functional-tester/tester/local-test.yaml | 2 +- 6 files changed, 116 insertions(+), 28 deletions(-) diff --git a/tools/functional-tester/tester/cluster.go b/tools/functional-tester/tester/cluster.go index 417b93054f4..915d571897a 100644 --- a/tools/functional-tester/tester/cluster.go +++ b/tools/functional-tester/tester/cluster.go @@ -170,8 +170,11 @@ func newCluster(lg *zap.Logger, fpath string) (*Cluster, error) { } } - if clus.Tester.DelayLatencyMs <= clus.Tester.DelayLatencyMsRv { - return nil, fmt.Errorf("delay latency %d ms must be greater than delay latency random variable %d ms", clus.Tester.DelayLatencyMs, clus.Tester.DelayLatencyMsRv) + if clus.Tester.DelayLatencyMs <= clus.Tester.DelayLatencyMsRv*5 { + return nil, fmt.Errorf("delay latency %d ms must be greater than 5x of delay latency random variable %d ms", clus.Tester.DelayLatencyMs, clus.Tester.DelayLatencyMsRv) + } + if clus.Tester.UpdatedDelayLatencyMs == 0 { + clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs } for _, v := range clus.Tester.FailureCases { @@ -303,17 +306,29 @@ func (clus *Cluster) updateFailures() { clus.failures = append(clus.failures, newFailureBlackholePeerPortTxRxAll(clus)) case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER": - clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollower(clus)) + clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollower(clus, false)) + case "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER": + clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollower(clus, true)) case "DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT": - clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot()) + clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus, false)) + case "RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT": + clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus, true)) case "DELAY_PEER_PORT_TX_RX_LEADER": - clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeader(clus)) + clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeader(clus, false)) + case "RANDOM_DELAY_PEER_PORT_TX_RX_LEADER": + clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeader(clus, true)) case "DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT": - clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot()) + clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus, false)) + case "RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT": + clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus, true)) case "DELAY_PEER_PORT_TX_RX_QUORUM": - clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxQuorum(clus)) + clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxQuorum(clus, false)) + case "RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM": + clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxQuorum(clus, true)) case "DELAY_PEER_PORT_TX_RX_ALL": - clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxAll(clus)) + clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxAll(clus, false)) + case "RANDOM_DELAY_PEER_PORT_TX_RX_ALL": + clus.failures = append(clus.failures, newFailureDelayPeerPortTxRxAll(clus, true)) case "NO_FAIL_WITH_STRESS": clus.failures = append(clus.failures, newFailureNoFailWithStress(clus)) @@ -340,6 +355,18 @@ func (clus *Cluster) failureStrings() (fs []string) { return fs } +// UpdateDelayLatencyMs updates delay latency with random value +// within election timeout. +func (clus *Cluster) UpdateDelayLatencyMs() { + rand.Seed(time.Now().UnixNano()) + clus.Tester.UpdatedDelayLatencyMs = uint32(rand.Int63n(clus.Members[0].Etcd.ElectionTimeoutMs)) + + minLatRv := clus.Tester.DelayLatencyMsRv + clus.Tester.DelayLatencyMsRv/5 + if clus.Tester.UpdatedDelayLatencyMs <= minLatRv { + clus.Tester.UpdatedDelayLatencyMs += minLatRv + } +} + func (clus *Cluster) shuffleFailures() { rand.Seed(time.Now().UnixNano()) offset := rand.Intn(1000) diff --git a/tools/functional-tester/tester/cluster_test.go b/tools/functional-tester/tester/cluster_test.go index 04706ad7df1..9d1e3a49c87 100644 --- a/tools/functional-tester/tester/cluster_test.go +++ b/tools/functional-tester/tester/cluster_test.go @@ -116,14 +116,15 @@ func Test_newCluster(t *testing.T) { }, }, Tester: &rpcpb.Tester{ - TesterNetwork: "tcp", - TesterAddr: "127.0.0.1:9028", - DelayLatencyMs: 5000, - DelayLatencyMsRv: 150, - RoundLimit: 1, - ExitOnFailure: true, - ConsistencyCheck: true, - EnablePprof: true, + TesterNetwork: "tcp", + TesterAddr: "127.0.0.1:9028", + DelayLatencyMs: 5000, + DelayLatencyMsRv: 500, + UpdatedDelayLatencyMs: 5000, + RoundLimit: 1, + ExitOnFailure: true, + ConsistencyCheck: true, + EnablePprof: true, FailureCases: []string{ "KILL_ONE_FOLLOWER", "KILL_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT", diff --git a/tools/functional-tester/tester/cluster_tester.go b/tools/functional-tester/tester/cluster_tester.go index 95a5b29fa70..1db87a4572f 100644 --- a/tools/functional-tester/tester/cluster_tester.go +++ b/tools/functional-tester/tester/cluster_tester.go @@ -111,6 +111,7 @@ func (clus *Cluster) doRound() error { "round START", zap.Int("round", clus.rd), zap.Strings("failures", clus.failureStrings()), + zap.Int("total-failures", len(clus.failures)), ) for i, fa := range clus.failures { clus.cs = i diff --git a/tools/functional-tester/tester/failure.go b/tools/functional-tester/tester/failure.go index a1f22ce55a7..2a1a20e9dbd 100644 --- a/tools/functional-tester/tester/failure.go +++ b/tools/functional-tester/tester/failure.go @@ -224,9 +224,18 @@ type failureUntilSnapshot struct { Failure } +// all delay failure cases except the ones failing with latency +// greater than election timeout (trigger leader election and +// cluster keeps operating anyways) var slowCases = map[rpcpb.FailureCase]bool{ - rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT: true, - rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT: true, + rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER: true, + rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT: true, + rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT: true, + rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER: true, + rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT: true, + rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT: true, + rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM: true, + rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ALL: true, } func (f *failureUntilSnapshot) Inject(clus *Cluster) error { @@ -263,7 +272,7 @@ func (f *failureUntilSnapshot) Inject(clus *Cluster) error { retries := int(snapshotCount) / 1000 * 3 if v, ok := slowCases[f.FailureCase()]; v && ok { // slow network takes more retries - retries *= 2 + retries *= 5 } for i := 0; i < retries; i++ { diff --git a/tools/functional-tester/tester/failure_case_network_delay.go b/tools/functional-tester/tester/failure_case_network_delay.go index 4a3161e2c0f..e9920376870 100644 --- a/tools/functional-tester/tester/failure_case_network_delay.go +++ b/tools/functional-tester/tester/failure_case_network_delay.go @@ -18,6 +18,8 @@ import ( "time" "github.com/coreos/etcd/tools/functional-tester/rpcpb" + + "go.uber.org/zap" ) const ( @@ -29,6 +31,12 @@ const ( ) func injectDelayPeerPortTxRx(clus *Cluster, idx int) error { + clus.lg.Info( + "injecting delay latency", + zap.Duration("latency", time.Duration(clus.Tester.UpdatedDelayLatencyMs)*time.Millisecond), + zap.Duration("latency-rv", time.Duration(clus.Tester.DelayLatencyMsRv)*time.Millisecond), + zap.String("endpoint", clus.Members[idx].EtcdClientEndpoint), + ) return clus.sendOperation(idx, rpcpb.Operation_DelayPeerPortTxRx) } @@ -38,12 +46,19 @@ func recoverDelayPeerPortTxRx(clus *Cluster, idx int) error { return err } -func newFailureDelayPeerPortTxRxOneFollower(clus *Cluster) Failure { +func newFailureDelayPeerPortTxRxOneFollower(clus *Cluster, random bool) Failure { ff := failureByFunc{ failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER, injectMember: injectDelayPeerPortTxRx, recoverMember: recoverDelayPeerPortTxRx, } + + clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs + if random { + clus.UpdateDelayLatencyMs() + ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER + } + f := &failureFollower{ff, -1, -1} return &failureDelay{ Failure: f, @@ -51,25 +66,39 @@ func newFailureDelayPeerPortTxRxOneFollower(clus *Cluster) Failure { } } -func newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot() Failure { +func newFailureDelayPeerPortTxRxOneFollowerUntilTriggerSnapshot(clus *Cluster, random bool) Failure { ff := failureByFunc{ failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT, injectMember: injectDelayPeerPortTxRx, recoverMember: recoverDelayPeerPortTxRx, } + + clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs + if random { + clus.UpdateDelayLatencyMs() + ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT + } + f := &failureFollower{ff, -1, -1} return &failureUntilSnapshot{ - failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT, + failureCase: ff.failureCase, Failure: f, } } -func newFailureDelayPeerPortTxRxLeader(clus *Cluster) Failure { +func newFailureDelayPeerPortTxRxLeader(clus *Cluster, random bool) Failure { ff := failureByFunc{ failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_LEADER, injectMember: injectDelayPeerPortTxRx, recoverMember: recoverDelayPeerPortTxRx, } + + clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs + if random { + clus.UpdateDelayLatencyMs() + ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER + } + f := &failureLeader{ff, -1, -1} return &failureDelay{ Failure: f, @@ -77,37 +106,58 @@ func newFailureDelayPeerPortTxRxLeader(clus *Cluster) Failure { } } -func newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot() Failure { +func newFailureDelayPeerPortTxRxLeaderUntilTriggerSnapshot(clus *Cluster, random bool) Failure { ff := failureByFunc{ failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT, injectMember: injectDelayPeerPortTxRx, recoverMember: recoverDelayPeerPortTxRx, } + + clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs + if random { + clus.UpdateDelayLatencyMs() + ff.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT + } + f := &failureLeader{ff, -1, -1} return &failureUntilSnapshot{ - failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_LEADER_UNTIL_TRIGGER_SNAPSHOT, + failureCase: ff.failureCase, Failure: f, } } -func newFailureDelayPeerPortTxRxQuorum(clus *Cluster) Failure { +func newFailureDelayPeerPortTxRxQuorum(clus *Cluster, random bool) Failure { f := &failureQuorum{ failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_QUORUM, injectMember: injectDelayPeerPortTxRx, recoverMember: recoverDelayPeerPortTxRx, } + + clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs + if random { + clus.UpdateDelayLatencyMs() + f.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_QUORUM + } + return &failureDelay{ Failure: f, delayDuration: clus.GetFailureDelayDuration(), } } -func newFailureDelayPeerPortTxRxAll(clus *Cluster) Failure { +func newFailureDelayPeerPortTxRxAll(clus *Cluster, random bool) Failure { f := &failureAll{ failureCase: rpcpb.FailureCase_DELAY_PEER_PORT_TX_RX_ALL, injectMember: injectDelayPeerPortTxRx, recoverMember: recoverDelayPeerPortTxRx, } + + clus.Tester.UpdatedDelayLatencyMs = clus.Tester.DelayLatencyMs + if random { + clus.UpdateDelayLatencyMs() + f.failureCase = rpcpb.FailureCase_RANDOM_DELAY_PEER_PORT_TX_RX_ALL + } + return &failureDelay{ Failure: f, delayDuration: clus.GetFailureDelayDuration(), diff --git a/tools/functional-tester/tester/local-test.yaml b/tools/functional-tester/tester/local-test.yaml index 2c18805f5ea..d3f2f188983 100644 --- a/tools/functional-tester/tester/local-test.yaml +++ b/tools/functional-tester/tester/local-test.yaml @@ -84,7 +84,7 @@ tester-config: # slow enough to trigger election delay-latency-ms: 5000 - delay-latency-ms-rv: 150 + delay-latency-ms-rv: 500 round-limit: 1 exit-on-failure: true