From 6609b77ba28b709bc0de3177efda15a6a040960e Mon Sep 17 00:00:00 2001
From: Gyu-Ho Lee
Date: Tue, 12 Sep 2017 22:32:25 -0700
Subject: [PATCH] e2e: test alarm CORRUPT

Signed-off-by: Gyu-Ho Lee
---
 e2e/cluster_test.go      |   7 +++
 e2e/ctl_v3_alarm_test.go | 132 +++++++++++++++++++++++++++++++++++++++
 e2e/ctl_v3_test.go       |   8 +++
 3 files changed, 147 insertions(+)

diff --git a/e2e/cluster_test.go b/e2e/cluster_test.go
index 0ea7bd24bb94..9fceede879db 100644
--- a/e2e/cluster_test.go
+++ b/e2e/cluster_test.go
@@ -20,6 +20,7 @@ import (
 	"net/url"
 	"os"
 	"strings"
+	"time"
 
 	"github.com/coreos/etcd/etcdserver"
 )
@@ -115,6 +116,7 @@ type etcdProcessClusterConfig struct {
 	forceNewCluster   bool
 	initialToken      string
 	quotaBackendBytes int64
+	corruptCheckTime  time.Duration
 	noStrictReconfig  bool
 }
 
@@ -221,6 +223,11 @@ func (cfg *etcdProcessClusterConfig) etcdServerProcessConfigs() []*etcdServerPro
 				"--quota-backend-bytes", fmt.Sprintf("%d", cfg.quotaBackendBytes),
 			)
 		}
+		if cfg.corruptCheckTime > 0 {
+			args = append(args,
+				"--experimental-corrupt-check-time", cfg.corruptCheckTime.String(),
+			)
+		}
 		if cfg.noStrictReconfig {
 			args = append(args, "--strict-reconfig-check=false")
 		}
diff --git a/e2e/ctl_v3_alarm_test.go b/e2e/ctl_v3_alarm_test.go
index 395ac297a239..61092a0ca44d 100644
--- a/e2e/ctl_v3_alarm_test.go
+++ b/e2e/ctl_v3_alarm_test.go
@@ -16,12 +16,17 @@ package e2e
 
 import (
 	"context"
+	"fmt"
 	"os"
+	"path/filepath"
 	"strings"
 	"testing"
 	"time"
 
 	"github.com/coreos/etcd/clientv3"
+	"github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
+	"github.com/coreos/etcd/mvcc"
+	"github.com/coreos/etcd/mvcc/backend"
 )
 
 func TestCtlV3Alarm(t *testing.T) {
@@ -29,6 +34,12 @@ func TestCtlV3Alarm(t *testing.T) {
 	testCtl(t, alarmTest, withQuota(int64(13*os.Getpagesize())))
 }
 
+// TestCtlV3AlarmCorrupt runs alarmCorruptTest against a non-TLS cluster with
+// quorum reads and the corrupt check time set to two seconds.
+func TestCtlV3AlarmCorrupt(t *testing.T) {
+	testCtl(t, alarmCorruptTest, withCfg(configNoTLS), withQuorum(), withCorruptCheckTime(2*time.Second))
+}
+
 func alarmTest(cx ctlCtx) {
 	// test small put still works
 	smallbuf := strings.Repeat("a", 64)
@@ -99,6 +110,127 @@ func alarmTest(cx ctlCtx) {
 	}
 }
 
+// fakeConsistentIndex is the consistent index getter that alarmCorruptTest
+// hands to mvcc.NewStore; it always reports rev.
+type fakeConsistentIndex struct{ rev uint64 }
+
+// ConsistentIndex returns the fixed index stored in f.
+func (f *fakeConsistentIndex) ConsistentIndex() uint64 { return f.rev }
+
+// alarmCorruptTest rewrites member 0's backend while that member is stopped,
+// restarts the cluster, and then expects the corruption to be reported: a Put
+// through member 0 fails with rpctypes.ErrCorrupt, "alarm list" prints
+// alarm:CORRUPT, "/health" returns the CORRUPT error, and a respawned
+// member 0 prints "etcdserver: corrupted" followed by its member ID.
+func alarmCorruptTest(cx ctlCtx) {
+	for i := 0; i < 10; i++ {
+		if err := ctlV3Put(cx, "k", "v", ""); err != nil {
+			if cx.dialTimeout > 0 && !isGRPCTimedout(err) {
+				cx.t.Fatalf("alarmCorruptTest ctlV3Put error (%v)", err)
+			}
+		}
+	}
+	// The backend file is rewritten below, so member 0 must really be down.
+	if err := cx.epc.procs[0].Stop(); err != nil {
+		cx.t.Fatal(err)
+	}
+
+	// Corrupt member 0 by modifying backend offline.
+	fp := filepath.Join(cx.epc.procs[0].Config().dataDirPath, "member", "snap", "db")
+	be := backend.NewDefaultBackend(fp)
+	s := mvcc.NewStore(be, nil, &fakeConsistentIndex{13})
+	s.Put([]byte("abc"), []byte("def"), 0)
+	s.Put([]byte("xyz"), []byte("123"), 0)
+	s.Compact(5)
+	s.Commit()
+	s.Close()
+	be.Close()
+
+	eps := cx.epc.EndpointsV3()
+
+	// Wait for cluster so Puts succeed in case member 0 was the leader.
+	cli1, err := clientv3.New(clientv3.Config{Endpoints: []string{eps[1]}, DialTimeout: 3 * time.Second})
+	if err != nil {
+		cx.t.Fatal(err)
+	}
+	defer cli1.Close()
+
+	if _, err = cli1.Get(context.TODO(), "k"); err != nil {
+		cx.t.Fatal(err)
+	}
+	// Surface a rejected write here rather than silently dropping the error.
+	if _, err = cli1.Put(context.TODO(), "xyz", "321"); err != nil {
+		cx.t.Fatal(err)
+	}
+	if _, err = cli1.Put(context.TODO(), "abc", "fed"); err != nil {
+		cx.t.Fatal(err)
+	}
+
+	// Restart with corruption checking
+	for _, i := range []int{1, 2} {
+		if err = cx.epc.procs[i].Stop(); err != nil {
+			cx.t.Fatal(err)
+		}
+	}
+	if err = cx.epc.Restart(); err != nil {
+		cx.t.Fatal(err)
+	}
+
+	// wait until corruption detected
+	cli0, err := clientv3.New(clientv3.Config{Endpoints: []string{eps[0]}, DialTimeout: 3 * time.Second})
+	if err != nil {
+		cx.t.Fatal(err)
+	}
+	defer cli0.Close()
+
+	sresp, err := cli1.Status(context.TODO(), eps[0])
+	if err != nil {
+		cx.t.Fatal(err)
+	}
+	id0 := sresp.Header.GetMemberId()
+
+	corrupted := false
+	for i := 0; i < 5; i++ {
+		presp, perr := cli0.Put(context.TODO(), "abc", "aaa")
+		if perr != nil {
+			if perr.Error() != rpctypes.ErrCorrupt.Error() {
+				cx.t.Fatalf("expected %v, got %+v (%v)", rpctypes.ErrCorrupt, presp, perr)
+			}
+			corrupted = true
+			break
+		}
+		time.Sleep(time.Second)
+	}
+	if !corrupted {
+		cx.t.Fatalf("expected error %v after %s", rpctypes.ErrCorrupt, 5*time.Second)
+	}
+
+	// corrupt alarm should now be on
+	if err = ctlV3Alarm(cx, "list", "alarm:CORRUPT"); err != nil {
+		cx.t.Fatal(err)
+	}
+
+	// '/health' handler should return 'false'
+	if err = cURLGet(cx.epc, cURLReq{endpoint: "/health", expected: `{"health":false,"errors":["CORRUPT"]}`}); err != nil {
+		cx.t.Fatalf("failed get with curl (%v)", err)
+	}
+
+	// corrupted member should fail to restart
+	if err = cx.epc.procs[0].Stop(); err != nil {
+		cx.t.Fatal(err)
+	}
+	ep := cx.epc.procs[0]
+	proc, err := spawnCmd(append([]string{ep.Config().execPath}, ep.Config().args...))
+	if err != nil {
+		cx.t.Fatal(err)
+	}
+	defer proc.Stop()
+	// The expected log line is the assertion, so its absence must fail the test.
+	if err = waitReadyExpectProc(proc, []string{fmt.Sprintf("etcdserver: corrupted %016x", id0)}); err != nil {
+		cx.t.Fatal(err)
+	}
+}
+
 func ctlV3Alarm(cx ctlCtx, cmd string, as ...string) error {
 	cmdArgs := append(cx.PrefixArgs(), "alarm", cmd)
 	return spawnWithExpects(cmdArgs, as...)
diff --git a/e2e/ctl_v3_test.go b/e2e/ctl_v3_test.go
index 28b88b762217..358fb3d9dad0 100644
--- a/e2e/ctl_v3_test.go
+++ b/e2e/ctl_v3_test.go
@@ -55,6 +55,7 @@ type ctlCtx struct {
 	t                 *testing.T
 	cfg               etcdProcessClusterConfig
 	quotaBackendBytes int64
+	corruptCheckTime  time.Duration
 	noStrictReconfig  bool
 
 	epc *etcdProcessCluster
@@ -101,6 +102,10 @@ func withQuota(b int64) ctlOption {
 	return func(cx *ctlCtx) { cx.quotaBackendBytes = b }
 }
 
+func withCorruptCheckTime(d time.Duration) ctlOption {
+	return func(cx *ctlCtx) { cx.corruptCheckTime = d }
+}
+
 func withCompactPhysical() ctlOption {
 	return func(cx *ctlCtx) { cx.compactPhysical = true }
 }
@@ -130,6 +135,9 @@ func testCtl(t *testing.T, testFunc func(ctlCtx), opts ...ctlOption) {
 	if ret.quotaBackendBytes > 0 {
 		ret.cfg.quotaBackendBytes = ret.quotaBackendBytes
 	}
+	if ret.corruptCheckTime > 0 {
+		ret.cfg.corruptCheckTime = ret.corruptCheckTime
+	}
 	ret.cfg.noStrictReconfig = ret.noStrictReconfig
 
 	epc, err := newEtcdProcessCluster(&ret.cfg)