From 612a1b69cadd7e524bac88a924bf5368ac13f6a1 Mon Sep 17 00:00:00 2001 From: nolouch Date: Wed, 17 Jul 2019 11:23:05 +0800 Subject: [PATCH 1/4] server/join: add retry for join member Signed-off-by: nolouch --- server/join.go | 64 +++++++++++++++++++++++++--------- tests/server/join/join_test.go | 20 +++++++++++ 2 files changed, 68 insertions(+), 16 deletions(-) diff --git a/server/join.go b/server/join.go index a4b4d3ebfa5..89221bf7c02 100644 --- a/server/join.go +++ b/server/join.go @@ -19,7 +19,9 @@ import ( "os" "path" "strings" + "time" + "github.com/pingcap/failpoint" log "github.com/pingcap/log" "github.com/pingcap/pd/pkg/etcdutil" "github.com/pkg/errors" @@ -35,6 +37,9 @@ const ( privateDirMode = 0700 ) +// listMemberRetryTimes the retry times of list member. +var listMemberRetryTimes = 20 + // PrepareJoinCluster sends MemberAdd command to PD cluster, // and returns the initial configuration of the PD cluster. // @@ -137,31 +142,58 @@ func PrepareJoinCluster(cfg *Config) error { return errors.New("missing data or join a duplicated pd") } + var addResp *clientv3.MemberAddResponse + + failpoint.Inject("add-member-failed", func() { + listMemberRetryTimes = 2 + failpoint.Goto("LabelSkipAddMember") + }) // - A new PD joins an existing cluster. // - A deleted PD joins to previous cluster. - addResp, err := etcdutil.AddEtcdMember(client, []string{cfg.AdvertisePeerUrls}) - if err != nil { - return err + { + // add member throught API firstly. + addResp, err = etcdutil.AddEtcdMember(client, []string{cfg.AdvertisePeerUrls}) + if err != nil { + return err + } } + failpoint.Label("LabelSkipAddMember") - listResp, err = etcdutil.ListEtcdMembers(client) - if err != nil { - return err - } + var ( + pds []string + listSucc bool + ) - pds := []string{} - for _, memb := range listResp.Members { - n := memb.Name - if memb.ID == addResp.Member.ID { - n = cfg.Name + for i := 0; i < listMemberRetryTimes; i++ { + listResp, err = etcdutil.ListEtcdMembers(client) + if err != nil { + return err } - if len(n) == 0 { - return errors.New("there is a member that has not joined successfully") + + pds = []string{} + for _, memb := range listResp.Members { + n := memb.Name + if addResp != nil && memb.ID == addResp.Member.ID { + n = cfg.Name + listSucc = true + } + if len(n) == 0 { + return errors.New("there is a member that has not joined successfully") + } + for _, m := range memb.PeerURLs { + pds = append(pds, fmt.Sprintf("%s=%s", n, m)) + } } - for _, m := range memb.PeerURLs { - pds = append(pds, fmt.Sprintf("%s=%s", n, m)) + + if listSucc { + break } + time.Sleep(500 * time.Millisecond) } + if !listSucc { + return errors.Errorf("join failed, adds the new member %s may failed", cfg.Name) + } + initialCluster = strings.Join(pds, ",") cfg.InitialCluster = initialCluster cfg.InitialClusterState = embed.ClusterStateFlagExisting diff --git a/tests/server/join/join_test.go b/tests/server/join/join_test.go index 54828202fdf..729ab67c535 100644 --- a/tests/server/join/join_test.go +++ b/tests/server/join/join_test.go @@ -17,10 +17,12 @@ import ( "context" "os" "path" + "strings" "testing" "time" . "github.com/pingcap/check" + "github.com/pingcap/failpoint" "github.com/pingcap/pd/pkg/etcdutil" "github.com/pingcap/pd/server" "github.com/pingcap/pd/tests" @@ -178,3 +180,21 @@ func (s *serverTestSuite) TestFailedPDJoinsPreviousCluster(c *C) { c.Assert(err, IsNil) c.Assert(server.PrepareJoinCluster(pd2.GetConfig()), NotNil) } + +func (s *serverTestSuite) TestFailedPDJoinInStep1(c *C) { + c.Parallel() + + cluster, err := tests.NewTestCluster(1) + c.Assert(err, IsNil) + defer cluster.Destroy() + + err = cluster.RunInitialServers() + c.Assert(err, IsNil) + cluster.WaitLeader() + + // Join the second PD. + c.Assert(failpoint.Enable("github.com/pingcap/pd/server/add-member-failed", `return`), IsNil) + _, err = cluster.Join() + c.Assert(err, NotNil) + c.Assert(strings.Contains(err.Error(), "join failed"), IsTrue) +} From 00157352ba9d145b7eefba88a2ec842573ed4c61 Mon Sep 17 00:00:00 2001 From: nolouch Date: Wed, 17 Jul 2019 15:33:40 +0800 Subject: [PATCH 2/4] fix Signed-off-by: nolouch --- server/join.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/join.go b/server/join.go index 89221bf7c02..879d7a6e595 100644 --- a/server/join.go +++ b/server/join.go @@ -151,7 +151,7 @@ func PrepareJoinCluster(cfg *Config) error { // - A new PD joins an existing cluster. // - A deleted PD joins to previous cluster. { - // add member throught API firstly. + // First adds member through the API addResp, err = etcdutil.AddEtcdMember(client, []string{cfg.AdvertisePeerUrls}) if err != nil { return err From bb76e6e898ff24da6cc59d62d51f48ca71a8e919 Mon Sep 17 00:00:00 2001 From: nolouch Date: Wed, 17 Jul 2019 16:17:02 +0800 Subject: [PATCH 3/4] try to fix ci Signed-off-by: nolouch --- tests/server/join/join_fail/join_fail_test.go | 53 +++++++++++++++++++ tests/server/join/join_test.go | 20 ------- 2 files changed, 53 insertions(+), 20 deletions(-) create mode 100644 tests/server/join/join_fail/join_fail_test.go diff --git a/tests/server/join/join_fail/join_fail_test.go b/tests/server/join/join_fail/join_fail_test.go new file mode 100644 index 00000000000..f4df51c3d5d --- /dev/null +++ b/tests/server/join/join_fail/join_fail_test.go @@ -0,0 +1,53 @@ +// Copyright 2019 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package join_fail_test + +import ( + "strings" + "testing" + + . "github.com/pingcap/check" + "github.com/pingcap/failpoint" + "github.com/pingcap/pd/server" + "github.com/pingcap/pd/tests" +) + +func Test(t *testing.T) { + TestingT(t) +} + +var _ = Suite(&serverTestSuite{}) + +type serverTestSuite struct{} + +func (s *serverTestSuite) SetUpSuite(c *C) { + server.EnableZap = true +} + +func (s *serverTestSuite) TestFailedPDJoinInStep1(c *C) { + cluster, err := tests.NewTestCluster(1) + c.Assert(err, IsNil) + defer cluster.Destroy() + + err = cluster.RunInitialServers() + c.Assert(err, IsNil) + cluster.WaitLeader() + + // Join the second PD. + c.Assert(failpoint.Enable("github.com/pingcap/pd/server/add-member-failed", `return`), IsNil) + _, err = cluster.Join() + c.Assert(err, NotNil) + c.Assert(strings.Contains(err.Error(), "join failed"), IsTrue) + c.Assert(failpoint.Disable("github.com/pingcap/pd/server/add-member-failed"), IsNil) +} diff --git a/tests/server/join/join_test.go b/tests/server/join/join_test.go index 729ab67c535..54828202fdf 100644 --- a/tests/server/join/join_test.go +++ b/tests/server/join/join_test.go @@ -17,12 +17,10 @@ import ( "context" "os" "path" - "strings" "testing" "time" . "github.com/pingcap/check" - "github.com/pingcap/failpoint" "github.com/pingcap/pd/pkg/etcdutil" "github.com/pingcap/pd/server" "github.com/pingcap/pd/tests" @@ -180,21 +178,3 @@ func (s *serverTestSuite) TestFailedPDJoinsPreviousCluster(c *C) { c.Assert(err, IsNil) c.Assert(server.PrepareJoinCluster(pd2.GetConfig()), NotNil) } - -func (s *serverTestSuite) TestFailedPDJoinInStep1(c *C) { - c.Parallel() - - cluster, err := tests.NewTestCluster(1) - c.Assert(err, IsNil) - defer cluster.Destroy() - - err = cluster.RunInitialServers() - c.Assert(err, IsNil) - cluster.WaitLeader() - - // Join the second PD. - c.Assert(failpoint.Enable("github.com/pingcap/pd/server/add-member-failed", `return`), IsNil) - _, err = cluster.Join() - c.Assert(err, NotNil) - c.Assert(strings.Contains(err.Error(), "join failed"), IsTrue) -} From d2cc3c886020a311ce0b6bdda54b2e1292080d19 Mon Sep 17 00:00:00 2001 From: nolouch Date: Thu, 18 Jul 2019 15:21:28 +0800 Subject: [PATCH 4/4] address comments Signed-off-by: nolouch --- server/join.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/join.go b/server/join.go index 879d7a6e595..744dd4af978 100644 --- a/server/join.go +++ b/server/join.go @@ -37,7 +37,7 @@ const ( privateDirMode = 0700 ) -// listMemberRetryTimes the retry times of list member. +// listMemberRetryTimes is the retry times of list member. var listMemberRetryTimes = 20 // PrepareJoinCluster sends MemberAdd command to PD cluster,