From 4122df2c634b9954969d6a9b5f57d27c139409ad Mon Sep 17 00:00:00 2001 From: disksing Date: Fri, 20 Dec 2019 10:55:05 +0800 Subject: [PATCH 1/7] make builder aware of placement rules Signed-off-by: disksing --- server/schedule/operator/builder.go | 28 +++++++++++++++++++++++- server/schedule/operator/builder_test.go | 2 +- server/schedule/operator/operator.go | 2 ++ 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/server/schedule/operator/builder.go b/server/schedule/operator/builder.go index 8b5ce4558ee..e5bd66013d9 100644 --- a/server/schedule/operator/builder.go +++ b/server/schedule/operator/builder.go @@ -20,6 +20,7 @@ import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/pd/server/core" "github.com/pingcap/pd/server/schedule/filter" + "github.com/pingcap/pd/server/schedule/placement" "github.com/pkg/errors" ) @@ -37,6 +38,7 @@ type Builder struct { cluster Cluster regionID uint64 regionEpoch *metapb.RegionEpoch + rules []*placement.Rule // operation record originPeers peersMap @@ -67,11 +69,23 @@ func NewBuilder(desc string, cluster Cluster, region *core.RegionInfo) *Builder err = errors.Errorf("cannot build operator for region with no leader") } + var rules []*placement.Rule + if cluster.IsPlacementRulesEnabled() { + fit := cluster.FitRegion(region) + for _, rf := range fit.RuleFits { + rules = append(rules, rf.Rule) + } + if len(rules) == 0 { + err = errors.Errorf("cannot build operator for region match no placement rule") + } + } + return &Builder{ desc: desc, cluster: cluster, regionID: region.GetID(), regionEpoch: region.GetRegionEpoch(), + rules: rules, originPeers: originPeers, originLeader: region.GetLeader().GetStoreId(), targetPeers: originPeers.Copy(), @@ -338,7 +352,19 @@ func (b *Builder) allowLeader(peer *metapb.Peer) bool { return false } stateFilter := filter.StoreStateFilter{ActionScope: "operator-builder", TransferLeader: true} - return !stateFilter.Target(b.cluster, store) + if stateFilter.Target(b.cluster, store) { + return false + } + if len(b.rules) == 0 { + return true + } + for _, r := range b.rules { + if (r.Role == placement.Leader || r.Role == placement.Voter) && + placement.MatchLabelConstraints(store, r.LabelConstraints) { + return true + } + } + return false } // stepPlan is exec step. It can be: diff --git a/server/schedule/operator/builder_test.go b/server/schedule/operator/builder_test.go index 06faf997862..027f7919cb5 100644 --- a/server/schedule/operator/builder_test.go +++ b/server/schedule/operator/builder_test.go @@ -58,7 +58,7 @@ func (s *testBuilderSuite) TestNewBuilder(c *C) { c.Assert(builder.targetPeers.Get(1), DeepEquals, peers[0]) c.Assert(builder.targetPeers.Get(2), DeepEquals, peers[1]) region = region.Clone(core.WithLeader(nil)) - builder = NewBuilder("test", nil, region) + builder = NewBuilder("test", s.cluster, region) c.Assert(builder.err, NotNil) } diff --git a/server/schedule/operator/operator.go b/server/schedule/operator/operator.go index 187a8a0fab0..7be1069e089 100644 --- a/server/schedule/operator/operator.go +++ b/server/schedule/operator/operator.go @@ -23,6 +23,7 @@ import ( "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/pd/server/core" "github.com/pingcap/pd/server/schedule/opt" + "github.com/pingcap/pd/server/schedule/placement" ) const ( @@ -42,6 +43,7 @@ type Cluster interface { opt.Options GetStore(id uint64) *core.StoreInfo AllocPeer(storeID uint64) (*metapb.Peer, error) + FitRegion(region *core.RegionInfo) *placement.RegionFit } // Operator contains execution steps generated by scheduler. From 4b3c194ce46c22f1710ac2a0e49a2f6388af3c8d Mon Sep 17 00:00:00 2001 From: disksing Date: Fri, 20 Dec 2019 10:55:45 +0800 Subject: [PATCH 2/7] add rule checker Signed-off-by: disksing --- server/schedule/checker/rule_checker.go | 260 ++++++++++++++++++++++++ 1 file changed, 260 insertions(+) create mode 100644 server/schedule/checker/rule_checker.go diff --git a/server/schedule/checker/rule_checker.go b/server/schedule/checker/rule_checker.go new file mode 100644 index 00000000000..7df431b8127 --- /dev/null +++ b/server/schedule/checker/rule_checker.go @@ -0,0 +1,260 @@ +// Copyright 2019 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package checker + +import ( + "github.com/pingcap/kvproto/pkg/metapb" + "github.com/pingcap/kvproto/pkg/pdpb" + "github.com/pingcap/log" + "github.com/pingcap/pd/server/core" + "github.com/pingcap/pd/server/schedule/filter" + "github.com/pingcap/pd/server/schedule/operator" + "github.com/pingcap/pd/server/schedule/opt" + "github.com/pingcap/pd/server/schedule/placement" + "github.com/pingcap/pd/server/schedule/selector" + "go.uber.org/zap" +) + +// RuleChecker fix/improve region by placement rules. +type RuleChecker struct { + cluster opt.Cluster + ruleManager *placement.RuleManager + name string +} + +// NewRuleChecker creates an checker instance. +func NewRuleChecker(cluster opt.Cluster, ruleManager *placement.RuleManager) *RuleChecker { + return &RuleChecker{ + cluster: cluster, + ruleManager: ruleManager, + name: "rule-checker", + } +} + +// Check checks if the region matches placement rules and returns Operator to +// fix it. +func (c *RuleChecker) Check(region *core.RegionInfo) *operator.Operator { + checkerCounter.WithLabelValues("rule_checker", "check").Inc() + + fit := c.cluster.FitRegion(region) + if len(fit.RuleFits) == 0 { + // If the region matches no rules, the most possible reason is it spans across + // multiple rules. + return c.fixRange(region) + } + for _, rf := range fit.RuleFits { + if op := c.fixRulePeer(region, fit, rf); op != nil { + return op + } + } + return c.fixOrphanPeers(region, fit) +} + +func (c *RuleChecker) fixRange(region *core.RegionInfo) *operator.Operator { + keys := c.ruleManager.GetSplitKeys(region.GetStartKey(), region.GetEndKey()) + if len(keys) == 0 { + return nil + } + return operator.CreateSplitRegionOperator("rule-split-region", region, 0, pdpb.CheckPolicy_USEKEY, keys) +} + +func (c *RuleChecker) fixRulePeer(region *core.RegionInfo, fit *placement.RegionFit, rf *placement.RuleFit) *operator.Operator { + // make up peers. + if len(rf.Peers) < rf.Rule.Count { + if op := c.addRulePeer(region, rf); op != nil { + return op + } + } + // fix down/offline peers. + for _, peer := range rf.Peers { + if c.isDownPeer(region, peer) || c.isOfflinePeer(region, peer) { + if op := c.replaceRulePeer(region, fit, rf, peer); op != nil { + return op + } + } + } + // fix loose matched peers. + for _, peer := range rf.PeersWithDifferentRole { + if peer.IsLearner && rf.Rule.Role != placement.Learner { + op, err := operator.CreatePromoteLearnerOperator("fix-peer-role", c.cluster, region, peer) + if err != nil { + log.Error("fail to create fix-peer-role operator", zap.Error(err)) + return nil + } + return op + } + if region.GetLeader().GetId() == peer.GetId() && rf.Rule.Role == placement.Follower { + for _, candidate := range region.GetPeers() { + if candidate.GetIsLearner() { + continue + } + op, err := operator.CreateTransferLeaderOperator("fix-peer-role", c.cluster, region, peer.GetStoreId(), candidate.GetStoreId(), 0) + if err == nil { + return op + } + } + log.Debug("fail to transfer leader: no valid new leader") + return nil + } + } + if op := c.checkBestReplacement(region, fit, rf); op != nil { + return op + } + return nil +} + +func (c *RuleChecker) checkBestReplacement(region *core.RegionInfo, fit *placement.RegionFit, rf *placement.RuleFit) *operator.Operator { + if len(rf.Rule.LocationLabels) == 0 || rf.Rule.Count <= 1 { + return nil + } + stores := getRuleFitStores(c.cluster, rf) + s := selector.NewReplicaSelector(stores, rf.Rule.LocationLabels, filter.StoreStateFilter{ActionScope: "rule-checker", MoveRegion: true}) + oldPeerStore := s.SelectSource(c.cluster, stores) + if oldPeerStore == nil { + return nil + } + oldPeer := region.GetStorePeer(oldPeerStore.GetID()) + newPeerStore := SelectStoreToReplacePeerByRule("rule-checker", c.cluster, region, fit, rf, oldPeer) + stores = getRuleFitStores(c.cluster, removePeerFromRuleFit(rf, oldPeer)) + oldScore := core.DistinctScore(rf.Rule.LocationLabels, stores, oldPeerStore) + newScore := core.DistinctScore(rf.Rule.LocationLabels, stores, newPeerStore) + if newScore <= oldScore { + log.Debug("no better peer", zap.Uint64("region-id", region.GetID()), zap.Float64("new-score", newScore), zap.Float64("old-score", oldScore)) + return nil + } + newPeer := &metapb.Peer{StoreId: newPeerStore.GetID(), IsLearner: oldPeer.IsLearner} + op, err := operator.CreateMovePeerOperator("move-to-better-location", c.cluster, region, operator.OpReplica, oldPeer.GetStoreId(), newPeer) + if err != nil { + log.Debug("failed to create move-to-better-location operator", zap.Error(err)) + return nil + } + return op +} + +func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.RegionFit) *operator.Operator { + if len(fit.OrphanPeers) == 0 { + return nil + } + peer := fit.OrphanPeers[0] + op, err := operator.CreateRemovePeerOperator("remove-orphan-peers", c.cluster, 0, region, peer.StoreId) + if err != nil { + log.Debug("failed to remove orphan peer", zap.Error(err)) + return nil + } + return op +} + +func (c *RuleChecker) addRulePeer(region *core.RegionInfo, rf *placement.RuleFit) *operator.Operator { + store := SelectStoreToAddPeerByRule(c.name, c.cluster, region, rf) + if store == nil { + return nil + } + peer := &metapb.Peer{StoreId: store.GetID(), IsLearner: rf.Rule.Role == placement.Learner} + op, err := operator.CreateAddPeerOperator("add-rule-peer", c.cluster, region, peer, operator.OpReplica) + if err != nil { + log.Debug("failed to create add rule peer operator", zap.Error(err)) + return nil + } + return op +} + +func (c *RuleChecker) replaceRulePeer(region *core.RegionInfo, fit *placement.RegionFit, rf *placement.RuleFit, peer *metapb.Peer) *operator.Operator { + store := SelectStoreToReplacePeerByRule(c.name, c.cluster, region, fit, rf, peer) + if store == nil { + return nil + } + newPeer := &metapb.Peer{StoreId: store.GetID(), IsLearner: rf.Rule.Role == placement.Learner} + op, err := operator.CreateMovePeerOperator("replace-rule-peer", c.cluster, region, operator.OpReplica, peer.StoreId, newPeer) + if err != nil { + log.Error("failed to move peer", zap.Error(err)) + return nil + } + return op +} + +func (c *RuleChecker) isDownPeer(region *core.RegionInfo, peer *metapb.Peer) bool { + for _, stats := range region.GetDownPeers() { + if stats.GetPeer().GetId() != peer.GetId() { + continue + } + storeID := peer.GetStoreId() + store := c.cluster.GetStore(storeID) + if store == nil { + log.Warn("lost the store, maybe you are recovering the PD cluster", zap.Uint64("store-id", storeID)) + return false + } + if store.DownTime() < c.cluster.GetMaxStoreDownTime() { + continue + } + if stats.GetDownSeconds() < uint64(c.cluster.GetMaxStoreDownTime().Seconds()) { + continue + } + return true + } + return false +} + +func (c *RuleChecker) isOfflinePeer(region *core.RegionInfo, peer *metapb.Peer) bool { + store := c.cluster.GetStore(peer.GetStoreId()) + if store == nil { + log.Warn("lost the store, maybe you are recovering the PD cluster", zap.Uint64("store-id", peer.StoreId)) + return false + } + return !store.IsUp() +} + +// SelectStoreToAddPeerByRule selects a store to add peer in order to fit the placement rule. +func SelectStoreToAddPeerByRule(scope string, cluster opt.Cluster, region *core.RegionInfo, rf *placement.RuleFit, filters ...filter.Filter) *core.StoreInfo { + fs := []filter.Filter{ + filter.StoreStateFilter{ActionScope: scope, MoveRegion: true}, + filter.NewStorageThresholdFilter(scope), + filter.NewLabelConstaintFilter(scope, rf.Rule.LabelConstraints), + filter.NewExcludedFilter(scope, nil, region.GetStoreIds()), + } + fs = append(fs, filters...) + store := selector.NewReplicaSelector(getRuleFitStores(cluster, rf), rf.Rule.LocationLabels). + SelectTarget(cluster, cluster.GetStores(), fs...) + return store +} + +// SelectStoreToReplacePeerByRule selects a store to replace a region peer in order to fit the placement rule. +func SelectStoreToReplacePeerByRule(scope string, cluster opt.Cluster, region *core.RegionInfo, fit *placement.RegionFit, rf *placement.RuleFit, peer *metapb.Peer, filters ...filter.Filter) *core.StoreInfo { + rf2 := removePeerFromRuleFit(rf, peer) + return SelectStoreToAddPeerByRule(scope, cluster, region, rf2, filters...) +} + +func getRuleFitStores(cluster opt.Cluster, fit *placement.RuleFit) []*core.StoreInfo { + var stores []*core.StoreInfo + for _, p := range fit.Peers { + if s := cluster.GetStore(p.GetStoreId()); s != nil { + stores = append(stores, s) + } + } + return stores +} + +func removePeerFromRuleFit(rf *placement.RuleFit, peer *metapb.Peer) *placement.RuleFit { + rf2 := &placement.RuleFit{Rule: rf.Rule} + for _, p := range rf.Peers { + if p.GetId() != peer.GetId() { + rf2.Peers = append(rf2.Peers, p) + } + } + for _, p := range rf.PeersWithDifferentRole { + if p.GetId() != peer.GetId() { + rf2.PeersWithDifferentRole = append(rf2.PeersWithDifferentRole, p) + } + } + return rf2 +} From a11bea9c6597aa22e7555a255e8a5e4ea0d19fdf Mon Sep 17 00:00:00 2001 From: disksing Date: Fri, 20 Dec 2019 11:20:51 +0800 Subject: [PATCH 3/7] fix warning Signed-off-by: disksing --- server/schedule/checker/rule_checker.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/server/schedule/checker/rule_checker.go b/server/schedule/checker/rule_checker.go index 7df431b8127..34f075bc9f6 100644 --- a/server/schedule/checker/rule_checker.go +++ b/server/schedule/checker/rule_checker.go @@ -108,10 +108,7 @@ func (c *RuleChecker) fixRulePeer(region *core.RegionInfo, fit *placement.Region return nil } } - if op := c.checkBestReplacement(region, fit, rf); op != nil { - return op - } - return nil + return c.checkBestReplacement(region, fit, rf) } func (c *RuleChecker) checkBestReplacement(region *core.RegionInfo, fit *placement.RegionFit, rf *placement.RuleFit) *operator.Operator { From 00ed0ae800112e90edb9abbeec1047e65c82ade8 Mon Sep 17 00:00:00 2001 From: disksing Date: Fri, 20 Dec 2019 13:07:35 +0800 Subject: [PATCH 4/7] add test and minor fix Signed-off-by: disksing --- .../schedule/checker/replica_checker_test.go | 5 - server/schedule/checker/rule_checker.go | 157 +++++++-------- server/schedule/checker/rule_checker_test.go | 179 ++++++++++++++++++ 3 files changed, 262 insertions(+), 79 deletions(-) create mode 100644 server/schedule/checker/rule_checker_test.go diff --git a/server/schedule/checker/replica_checker_test.go b/server/schedule/checker/replica_checker_test.go index 59ec1663001..119a033a353 100644 --- a/server/schedule/checker/replica_checker_test.go +++ b/server/schedule/checker/replica_checker_test.go @@ -14,7 +14,6 @@ package checker import ( - "testing" "time" . "github.com/pingcap/check" @@ -27,10 +26,6 @@ import ( "github.com/pingcap/pd/server/schedule/opt" ) -func TestReplicaChecker(t *testing.T) { - TestingT(t) -} - var _ = Suite(&testReplicaCheckerSuite{}) type testReplicaCheckerSuite struct { diff --git a/server/schedule/checker/rule_checker.go b/server/schedule/checker/rule_checker.go index 34f075bc9f6..74fcc777d5c 100644 --- a/server/schedule/checker/rule_checker.go +++ b/server/schedule/checker/rule_checker.go @@ -23,6 +23,7 @@ import ( "github.com/pingcap/pd/server/schedule/opt" "github.com/pingcap/pd/server/schedule/placement" "github.com/pingcap/pd/server/schedule/selector" + "github.com/pkg/errors" "go.uber.org/zap" ) @@ -54,11 +55,21 @@ func (c *RuleChecker) Check(region *core.RegionInfo) *operator.Operator { return c.fixRange(region) } for _, rf := range fit.RuleFits { - if op := c.fixRulePeer(region, fit, rf); op != nil { + op, err := c.fixRulePeer(region, fit, rf) + if err != nil { + log.Debug("fail to fix rule peer", zap.Error(err), zap.String("rule-group", rf.Rule.GroupID), zap.String("rule-id", rf.Rule.ID)) + return nil + } + if op != nil { return op } } - return c.fixOrphanPeers(region, fit) + op, err := c.fixOrphanPeers(region, fit) + if err != nil { + log.Debug("fail to fix orphan peer", zap.Error(err)) + return nil + } + return op } func (c *RuleChecker) fixRange(region *core.RegionInfo) *operator.Operator { @@ -69,57 +80,93 @@ func (c *RuleChecker) fixRange(region *core.RegionInfo) *operator.Operator { return operator.CreateSplitRegionOperator("rule-split-region", region, 0, pdpb.CheckPolicy_USEKEY, keys) } -func (c *RuleChecker) fixRulePeer(region *core.RegionInfo, fit *placement.RegionFit, rf *placement.RuleFit) *operator.Operator { +func (c *RuleChecker) fixRulePeer(region *core.RegionInfo, fit *placement.RegionFit, rf *placement.RuleFit) (*operator.Operator, error) { // make up peers. if len(rf.Peers) < rf.Rule.Count { - if op := c.addRulePeer(region, rf); op != nil { - return op - } + return c.addRulePeer(region, rf) } // fix down/offline peers. for _, peer := range rf.Peers { if c.isDownPeer(region, peer) || c.isOfflinePeer(region, peer) { - if op := c.replaceRulePeer(region, fit, rf, peer); op != nil { - return op - } + return c.replaceRulePeer(region, fit, rf, peer) } } // fix loose matched peers. for _, peer := range rf.PeersWithDifferentRole { - if peer.IsLearner && rf.Rule.Role != placement.Learner { - op, err := operator.CreatePromoteLearnerOperator("fix-peer-role", c.cluster, region, peer) - if err != nil { - log.Error("fail to create fix-peer-role operator", zap.Error(err)) - return nil - } - return op + op, err := c.fixLooseMatchPeer(region, fit, rf, peer) + if err != nil { + return nil, err + } + if op != nil { + return op, nil } - if region.GetLeader().GetId() == peer.GetId() && rf.Rule.Role == placement.Follower { - for _, candidate := range region.GetPeers() { - if candidate.GetIsLearner() { - continue - } - op, err := operator.CreateTransferLeaderOperator("fix-peer-role", c.cluster, region, peer.GetStoreId(), candidate.GetStoreId(), 0) - if err == nil { - return op - } + } + return c.fixBetterLocation(region, fit, rf) +} + +func (c *RuleChecker) addRulePeer(region *core.RegionInfo, rf *placement.RuleFit) (*operator.Operator, error) { + store := SelectStoreToAddPeerByRule(c.name, c.cluster, region, rf) + if store == nil { + return nil, errors.New("no store to add peer") + } + peer := &metapb.Peer{StoreId: store.GetID(), IsLearner: rf.Rule.Role == placement.Learner} + return operator.CreateAddPeerOperator("add-rule-peer", c.cluster, region, peer, operator.OpReplica) +} + +func (c *RuleChecker) replaceRulePeer(region *core.RegionInfo, fit *placement.RegionFit, rf *placement.RuleFit, peer *metapb.Peer) (*operator.Operator, error) { + store := SelectStoreToReplacePeerByRule(c.name, c.cluster, region, fit, rf, peer) + if store == nil { + return nil, errors.New("no store to add peer") + } + newPeer := &metapb.Peer{StoreId: store.GetID(), IsLearner: rf.Rule.Role == placement.Learner} + return operator.CreateMovePeerOperator("replace-rule-peer", c.cluster, region, operator.OpReplica, peer.StoreId, newPeer) +} + +func (c *RuleChecker) fixLooseMatchPeer(region *core.RegionInfo, fit *placement.RegionFit, rf *placement.RuleFit, peer *metapb.Peer) (*operator.Operator, error) { + if peer.IsLearner && rf.Rule.Role != placement.Learner { + return operator.CreatePromoteLearnerOperator("fix-peer-role", c.cluster, region, peer) + } + if region.GetLeader().GetId() == peer.GetId() && rf.Rule.Role == placement.Follower { + for _, p := range region.GetPeers() { + if c.allowLeader(fit, p) { + return operator.CreateTransferLeaderOperator("fix-peer-role", c.cluster, region, peer.GetStoreId(), p.GetStoreId(), 0) } - log.Debug("fail to transfer leader: no valid new leader") - return nil } + return nil, errors.New("no new leader") } - return c.checkBestReplacement(region, fit, rf) + return nil, nil } -func (c *RuleChecker) checkBestReplacement(region *core.RegionInfo, fit *placement.RegionFit, rf *placement.RuleFit) *operator.Operator { +func (c *RuleChecker) allowLeader(fit *placement.RegionFit, peer *metapb.Peer) bool { + if peer.GetIsLearner() { + return false + } + s := c.cluster.GetStore(peer.GetStoreId()) + if s == nil { + return false + } + stateFilter := filter.StoreStateFilter{ActionScope: "rule-checker", TransferLeader: true} + if stateFilter.Target(c.cluster, s) { + return false + } + for _, rf := range fit.RuleFits { + if (rf.Rule.Role == placement.Leader || rf.Rule.Role == placement.Voter) && + placement.MatchLabelConstraints(s, rf.Rule.LabelConstraints) { + return true + } + } + return false +} + +func (c *RuleChecker) fixBetterLocation(region *core.RegionInfo, fit *placement.RegionFit, rf *placement.RuleFit) (*operator.Operator, error) { if len(rf.Rule.LocationLabels) == 0 || rf.Rule.Count <= 1 { - return nil + return nil, nil } stores := getRuleFitStores(c.cluster, rf) s := selector.NewReplicaSelector(stores, rf.Rule.LocationLabels, filter.StoreStateFilter{ActionScope: "rule-checker", MoveRegion: true}) oldPeerStore := s.SelectSource(c.cluster, stores) if oldPeerStore == nil { - return nil + return nil, nil } oldPeer := region.GetStorePeer(oldPeerStore.GetID()) newPeerStore := SelectStoreToReplacePeerByRule("rule-checker", c.cluster, region, fit, rf, oldPeer) @@ -128,56 +175,18 @@ func (c *RuleChecker) checkBestReplacement(region *core.RegionInfo, fit *placeme newScore := core.DistinctScore(rf.Rule.LocationLabels, stores, newPeerStore) if newScore <= oldScore { log.Debug("no better peer", zap.Uint64("region-id", region.GetID()), zap.Float64("new-score", newScore), zap.Float64("old-score", oldScore)) - return nil + return nil, nil } newPeer := &metapb.Peer{StoreId: newPeerStore.GetID(), IsLearner: oldPeer.IsLearner} - op, err := operator.CreateMovePeerOperator("move-to-better-location", c.cluster, region, operator.OpReplica, oldPeer.GetStoreId(), newPeer) - if err != nil { - log.Debug("failed to create move-to-better-location operator", zap.Error(err)) - return nil - } - return op + return operator.CreateMovePeerOperator("move-to-better-location", c.cluster, region, operator.OpReplica, oldPeer.GetStoreId(), newPeer) } -func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.RegionFit) *operator.Operator { +func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.RegionFit) (*operator.Operator, error) { if len(fit.OrphanPeers) == 0 { - return nil + return nil, nil } peer := fit.OrphanPeers[0] - op, err := operator.CreateRemovePeerOperator("remove-orphan-peers", c.cluster, 0, region, peer.StoreId) - if err != nil { - log.Debug("failed to remove orphan peer", zap.Error(err)) - return nil - } - return op -} - -func (c *RuleChecker) addRulePeer(region *core.RegionInfo, rf *placement.RuleFit) *operator.Operator { - store := SelectStoreToAddPeerByRule(c.name, c.cluster, region, rf) - if store == nil { - return nil - } - peer := &metapb.Peer{StoreId: store.GetID(), IsLearner: rf.Rule.Role == placement.Learner} - op, err := operator.CreateAddPeerOperator("add-rule-peer", c.cluster, region, peer, operator.OpReplica) - if err != nil { - log.Debug("failed to create add rule peer operator", zap.Error(err)) - return nil - } - return op -} - -func (c *RuleChecker) replaceRulePeer(region *core.RegionInfo, fit *placement.RegionFit, rf *placement.RuleFit, peer *metapb.Peer) *operator.Operator { - store := SelectStoreToReplacePeerByRule(c.name, c.cluster, region, fit, rf, peer) - if store == nil { - return nil - } - newPeer := &metapb.Peer{StoreId: store.GetID(), IsLearner: rf.Rule.Role == placement.Learner} - op, err := operator.CreateMovePeerOperator("replace-rule-peer", c.cluster, region, operator.OpReplica, peer.StoreId, newPeer) - if err != nil { - log.Error("failed to move peer", zap.Error(err)) - return nil - } - return op + return operator.CreateRemovePeerOperator("remove-orphan-peers", c.cluster, 0, region, peer.StoreId) } func (c *RuleChecker) isDownPeer(region *core.RegionInfo, peer *metapb.Peer) bool { diff --git a/server/schedule/checker/rule_checker_test.go b/server/schedule/checker/rule_checker_test.go new file mode 100644 index 00000000000..f99832b2f85 --- /dev/null +++ b/server/schedule/checker/rule_checker_test.go @@ -0,0 +1,179 @@ +// Copyright 2019 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package checker + +import ( + . "github.com/pingcap/check" + "github.com/pingcap/kvproto/pkg/metapb" + "github.com/pingcap/kvproto/pkg/pdpb" + "github.com/pingcap/pd/pkg/mock/mockcluster" + "github.com/pingcap/pd/pkg/mock/mockoption" + "github.com/pingcap/pd/server/core" + "github.com/pingcap/pd/server/schedule/operator" + "github.com/pingcap/pd/server/schedule/placement" +) + +var _ = Suite(&testRuleCheckerSuite{}) + +type testRuleCheckerSuite struct { + cluster *mockcluster.Cluster + ruleManager *placement.RuleManager + rc *RuleChecker +} + +func (s *testRuleCheckerSuite) SetUpTest(c *C) { + cfg := mockoption.NewScheduleOptions() + cfg.EnablePlacementRules = true + s.cluster = mockcluster.NewCluster(cfg) + s.ruleManager = s.cluster.RuleManager + s.rc = NewRuleChecker(s.cluster, s.ruleManager) +} + +func (s *testRuleCheckerSuite) TestFixRange(c *C) { + s.cluster.AddLeaderStore(1, 1) + s.cluster.AddLeaderStore(2, 1) + s.cluster.AddLeaderStore(3, 1) + s.ruleManager.SetRule(&placement.Rule{ + GroupID: "test", + ID: "test", + StartKeyHex: "AA", + EndKeyHex: "FF", + Role: placement.Voter, + Count: 1, + }) + s.cluster.AddLeaderRegionWithRange(1, "", "", 1, 2, 3) + op := s.rc.Check(s.cluster.GetRegion(1)) + c.Assert(op, NotNil) + c.Assert(op.Len(), Equals, 1) + var split operator.SplitRegion + c.Assert(op.Step(0), FitsTypeOf, split) +} + +func (s *testRuleCheckerSuite) TestAddRulePeer(c *C) { + s.cluster.AddLeaderStore(1, 1) + s.cluster.AddLeaderStore(2, 1) + s.cluster.AddLeaderStore(3, 1) + s.cluster.AddLeaderRegionWithRange(1, "", "", 1, 2) + op := s.rc.Check(s.cluster.GetRegion(1)) + c.Assert(op, NotNil) + c.Assert(op.Desc(), Equals, "add-rule-peer") + c.Assert(op.Step(0).(operator.AddLearner).ToStore, Equals, uint64(3)) +} + +func (s *testRuleCheckerSuite) TestFixPeer(c *C) { + s.cluster.AddLeaderStore(1, 1) + s.cluster.AddLeaderStore(2, 1) + s.cluster.AddLeaderStore(3, 1) + s.cluster.AddLeaderStore(4, 1) + s.cluster.AddLeaderRegionWithRange(1, "", "", 1, 2, 3) + op := s.rc.Check(s.cluster.GetRegion(1)) + c.Assert(op, IsNil) + s.cluster.SetStoreDown(2) + r := s.cluster.GetRegion(1) + r = r.Clone(core.WithDownPeers([]*pdpb.PeerStats{{Peer: r.GetStorePeer(2), DownSeconds: 60000}})) + op = s.rc.Check(r) + c.Assert(op, NotNil) + c.Assert(op.Desc(), Equals, "replace-rule-peer") + var add operator.AddLearner + c.Assert(op.Step(0), FitsTypeOf, add) + s.cluster.SetStoreUp(2) + s.cluster.SetStoreOffline(2) + op = s.rc.Check(s.cluster.GetRegion(1)) + c.Assert(op, NotNil) + c.Assert(op.Desc(), Equals, "replace-rule-peer") + c.Assert(op.Step(0), FitsTypeOf, add) +} + +func (s *testRuleCheckerSuite) TestFixOrphanPeers(c *C) { + s.cluster.AddLeaderStore(1, 1) + s.cluster.AddLeaderStore(2, 1) + s.cluster.AddLeaderStore(3, 1) + s.cluster.AddLeaderStore(4, 1) + s.cluster.AddLeaderRegionWithRange(1, "", "", 1, 2, 3, 4) + op := s.rc.Check(s.cluster.GetRegion(1)) + c.Assert(op, NotNil) + c.Assert(op.Desc(), Equals, "remove-orphan-peers") + c.Assert(op.Step(0).(operator.RemovePeer).FromStore, Equals, uint64(4)) +} + +func (s *testRuleCheckerSuite) TestFixRole(c *C) { + s.cluster.AddLeaderStore(1, 1) + s.cluster.AddLeaderStore(2, 1) + s.cluster.AddLeaderStore(3, 1) + s.cluster.AddLeaderRegionWithRange(1, "", "", 2, 1, 3) + r := s.cluster.GetRegion(1) + p := r.GetStorePeer(1) + p.IsLearner = true + r = r.Clone(core.WithLearners([]*metapb.Peer{p})) + op := s.rc.Check(r) + c.Assert(op, NotNil) + c.Assert(op.Desc(), Equals, "fix-peer-role") + c.Assert(op.Step(0).(operator.PromoteLearner).ToStore, Equals, uint64(1)) +} + +func (s *testRuleCheckerSuite) TestFixRoleLeader(c *C) { + s.cluster.AddLabelsStore(1, 1, map[string]string{"role": "follower"}) + s.cluster.AddLabelsStore(2, 1, map[string]string{"role": "follower"}) + s.cluster.AddLabelsStore(3, 1, map[string]string{"role": "leader"}) + s.cluster.AddLeaderRegionWithRange(1, "", "", 1, 2, 3) + s.ruleManager.SetRule(&placement.Rule{ + GroupID: "pd", + ID: "r1", + Index: 100, + Override: true, + Role: placement.Leader, + Count: 1, + LabelConstraints: []placement.LabelConstraint{ + {Key: "role", Op: "in", Values: []string{"leader"}}, + }, + }) + s.ruleManager.SetRule(&placement.Rule{ + GroupID: "pd", + ID: "r2", + Index: 101, + Role: placement.Follower, + Count: 2, + LabelConstraints: []placement.LabelConstraint{ + {Key: "role", Op: "in", Values: []string{"follower"}}, + }, + }) + op := s.rc.Check(s.cluster.GetRegion(1)) + c.Assert(op, NotNil) + c.Assert(op.Desc(), Equals, "fix-peer-role") + c.Assert(op.Step(0).(operator.TransferLeader).ToStore, Equals, uint64(3)) +} + +func (s *testRuleCheckerSuite) TestBetterReplacement(c *C) { + s.cluster.AddLabelsStore(1, 1, map[string]string{"host": "host1"}) + s.cluster.AddLabelsStore(2, 1, map[string]string{"host": "host1"}) + s.cluster.AddLabelsStore(3, 1, map[string]string{"host": "host2"}) + s.cluster.AddLabelsStore(4, 1, map[string]string{"host": "host3"}) + s.cluster.AddLeaderRegionWithRange(1, "", "", 1, 2, 3) + s.ruleManager.SetRule(&placement.Rule{ + GroupID: "pd", + ID: "test", + Index: 100, + Override: true, + Role: placement.Voter, + Count: 3, + LocationLabels: []string{"host"}, + }) + op := s.rc.Check(s.cluster.GetRegion(1)) + c.Assert(op, NotNil) + c.Assert(op.Desc(), Equals, "move-to-better-location") + c.Assert(op.Step(0).(operator.AddLearner).ToStore, Equals, uint64(4)) + s.cluster.AddLeaderRegionWithRange(1, "", "", 1, 3, 4) + op = s.rc.Check(s.cluster.GetRegion(1)) + c.Assert(op, IsNil) +} From a8b468b42e676eeb9f025cd33472a7d154808e1f Mon Sep 17 00:00:00 2001 From: disksing Date: Fri, 20 Dec 2019 17:47:39 +0800 Subject: [PATCH 5/7] Update server/schedule/checker/rule_checker.go Signed-off-by: disksing Co-Authored-By: Ryan Leung --- server/schedule/checker/rule_checker.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/schedule/checker/rule_checker.go b/server/schedule/checker/rule_checker.go index 74fcc777d5c..5c3c3b10d48 100644 --- a/server/schedule/checker/rule_checker.go +++ b/server/schedule/checker/rule_checker.go @@ -34,7 +34,7 @@ type RuleChecker struct { name string } -// NewRuleChecker creates an checker instance. +// NewRuleChecker creates a checker instance. func NewRuleChecker(cluster opt.Cluster, ruleManager *placement.RuleManager) *RuleChecker { return &RuleChecker{ cluster: cluster, From 6bd96f28b945e1374cfb5740aa587dd4b2803ad7 Mon Sep 17 00:00:00 2001 From: disksing Date: Fri, 20 Dec 2019 18:03:11 +0800 Subject: [PATCH 6/7] address comments Signed-off-by: disksing --- server/schedule/checker/rule_checker.go | 21 +++++++++++++++----- server/schedule/checker/rule_checker_test.go | 12 ++++++----- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/server/schedule/checker/rule_checker.go b/server/schedule/checker/rule_checker.go index 5c3c3b10d48..9f01442f4fa 100644 --- a/server/schedule/checker/rule_checker.go +++ b/server/schedule/checker/rule_checker.go @@ -50,6 +50,7 @@ func (c *RuleChecker) Check(region *core.RegionInfo) *operator.Operator { fit := c.cluster.FitRegion(region) if len(fit.RuleFits) == 0 { + checkerCounter.WithLabelValues("rule_checker", "fix-range").Inc() // If the region matches no rules, the most possible reason is it spans across // multiple rules. return c.fixRange(region) @@ -87,8 +88,13 @@ func (c *RuleChecker) fixRulePeer(region *core.RegionInfo, fit *placement.Region } // fix down/offline peers. for _, peer := range rf.Peers { - if c.isDownPeer(region, peer) || c.isOfflinePeer(region, peer) { - return c.replaceRulePeer(region, fit, rf, peer) + if c.isDownPeer(region, peer) { + checkerCounter.WithLabelValues("rule_checker", "replace-down").Inc() + return c.replaceRulePeer(region, fit, rf, peer, downStatus) + } + if c.isOfflinePeer(region, peer) { + checkerCounter.WithLabelValues("rule_checker", "replace-offline").Inc() + return c.replaceRulePeer(region, fit, rf, peer, offlineStatus) } } // fix loose matched peers. @@ -105,6 +111,7 @@ func (c *RuleChecker) fixRulePeer(region *core.RegionInfo, fit *placement.Region } func (c *RuleChecker) addRulePeer(region *core.RegionInfo, rf *placement.RuleFit) (*operator.Operator, error) { + checkerCounter.WithLabelValues("rule_checker", "add-rule-peer").Inc() store := SelectStoreToAddPeerByRule(c.name, c.cluster, region, rf) if store == nil { return nil, errors.New("no store to add peer") @@ -113,20 +120,22 @@ func (c *RuleChecker) addRulePeer(region *core.RegionInfo, rf *placement.RuleFit return operator.CreateAddPeerOperator("add-rule-peer", c.cluster, region, peer, operator.OpReplica) } -func (c *RuleChecker) replaceRulePeer(region *core.RegionInfo, fit *placement.RegionFit, rf *placement.RuleFit, peer *metapb.Peer) (*operator.Operator, error) { +func (c *RuleChecker) replaceRulePeer(region *core.RegionInfo, fit *placement.RegionFit, rf *placement.RuleFit, peer *metapb.Peer, status string) (*operator.Operator, error) { store := SelectStoreToReplacePeerByRule(c.name, c.cluster, region, fit, rf, peer) if store == nil { return nil, errors.New("no store to add peer") } newPeer := &metapb.Peer{StoreId: store.GetID(), IsLearner: rf.Rule.Role == placement.Learner} - return operator.CreateMovePeerOperator("replace-rule-peer", c.cluster, region, operator.OpReplica, peer.StoreId, newPeer) + return operator.CreateMovePeerOperator("replace-rule-"+status+"-peer", c.cluster, region, operator.OpReplica, peer.StoreId, newPeer) } func (c *RuleChecker) fixLooseMatchPeer(region *core.RegionInfo, fit *placement.RegionFit, rf *placement.RuleFit, peer *metapb.Peer) (*operator.Operator, error) { if peer.IsLearner && rf.Rule.Role != placement.Learner { + checkerCounter.WithLabelValues("rule_checker", "fix-peer-role").Inc() return operator.CreatePromoteLearnerOperator("fix-peer-role", c.cluster, region, peer) } if region.GetLeader().GetId() == peer.GetId() && rf.Rule.Role == placement.Follower { + checkerCounter.WithLabelValues("rule_checker", "fix-leader-role").Inc() for _, p := range region.GetPeers() { if c.allowLeader(fit, p) { return operator.CreateTransferLeaderOperator("fix-peer-role", c.cluster, region, peer.GetStoreId(), p.GetStoreId(), 0) @@ -177,6 +186,7 @@ func (c *RuleChecker) fixBetterLocation(region *core.RegionInfo, fit *placement. log.Debug("no better peer", zap.Uint64("region-id", region.GetID()), zap.Float64("new-score", newScore), zap.Float64("old-score", oldScore)) return nil, nil } + checkerCounter.WithLabelValues("rule_checker", "move-to-better-location").Inc() newPeer := &metapb.Peer{StoreId: newPeerStore.GetID(), IsLearner: oldPeer.IsLearner} return operator.CreateMovePeerOperator("move-to-better-location", c.cluster, region, operator.OpReplica, oldPeer.GetStoreId(), newPeer) } @@ -185,8 +195,9 @@ func (c *RuleChecker) fixOrphanPeers(region *core.RegionInfo, fit *placement.Reg if len(fit.OrphanPeers) == 0 { return nil, nil } + checkerCounter.WithLabelValues("rule_checker", "remove-orphan-peer").Inc() peer := fit.OrphanPeers[0] - return operator.CreateRemovePeerOperator("remove-orphan-peers", c.cluster, 0, region, peer.StoreId) + return operator.CreateRemovePeerOperator("remove-orphan-peer", c.cluster, 0, region, peer.StoreId) } func (c *RuleChecker) isDownPeer(region *core.RegionInfo, peer *metapb.Peer) bool { diff --git a/server/schedule/checker/rule_checker_test.go b/server/schedule/checker/rule_checker_test.go index f99832b2f85..f794f30e9fc 100644 --- a/server/schedule/checker/rule_checker_test.go +++ b/server/schedule/checker/rule_checker_test.go @@ -14,6 +14,7 @@ package checker import ( + "encoding/hex" . "github.com/pingcap/check" "github.com/pingcap/kvproto/pkg/metapb" "github.com/pingcap/kvproto/pkg/pdpb" @@ -56,8 +57,9 @@ func (s *testRuleCheckerSuite) TestFixRange(c *C) { op := s.rc.Check(s.cluster.GetRegion(1)) c.Assert(op, NotNil) c.Assert(op.Len(), Equals, 1) - var split operator.SplitRegion - c.Assert(op.Step(0), FitsTypeOf, split) + splitKeys := op.Step(0).(operator.SplitRegion).SplitKeys + c.Assert(hex.EncodeToString(splitKeys[0]), Equals, "aa") + c.Assert(hex.EncodeToString(splitKeys[1]), Equals, "ff") } func (s *testRuleCheckerSuite) TestAddRulePeer(c *C) { @@ -84,14 +86,14 @@ func (s *testRuleCheckerSuite) TestFixPeer(c *C) { r = r.Clone(core.WithDownPeers([]*pdpb.PeerStats{{Peer: r.GetStorePeer(2), DownSeconds: 60000}})) op = s.rc.Check(r) c.Assert(op, NotNil) - c.Assert(op.Desc(), Equals, "replace-rule-peer") + c.Assert(op.Desc(), Equals, "replace-rule-down-peer") var add operator.AddLearner c.Assert(op.Step(0), FitsTypeOf, add) s.cluster.SetStoreUp(2) s.cluster.SetStoreOffline(2) op = s.rc.Check(s.cluster.GetRegion(1)) c.Assert(op, NotNil) - c.Assert(op.Desc(), Equals, "replace-rule-peer") + c.Assert(op.Desc(), Equals, "replace-rule-offline-peer") c.Assert(op.Step(0), FitsTypeOf, add) } @@ -103,7 +105,7 @@ func (s *testRuleCheckerSuite) TestFixOrphanPeers(c *C) { s.cluster.AddLeaderRegionWithRange(1, "", "", 1, 2, 3, 4) op := s.rc.Check(s.cluster.GetRegion(1)) c.Assert(op, NotNil) - c.Assert(op.Desc(), Equals, "remove-orphan-peers") + c.Assert(op.Desc(), Equals, "remove-orphan-peer") c.Assert(op.Step(0).(operator.RemovePeer).FromStore, Equals, uint64(4)) } From 1d8d4a18e9653c6783e4324a2833eda6f98c5580 Mon Sep 17 00:00:00 2001 From: disksing Date: Tue, 24 Dec 2019 17:44:58 +0800 Subject: [PATCH 7/7] address comments Signed-off-by: disksing --- server/schedule/checker/rule_checker.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/server/schedule/checker/rule_checker.go b/server/schedule/checker/rule_checker.go index 9f01442f4fa..569f7068393 100644 --- a/server/schedule/checker/rule_checker.go +++ b/server/schedule/checker/rule_checker.go @@ -114,6 +114,7 @@ func (c *RuleChecker) addRulePeer(region *core.RegionInfo, rf *placement.RuleFit checkerCounter.WithLabelValues("rule_checker", "add-rule-peer").Inc() store := SelectStoreToAddPeerByRule(c.name, c.cluster, region, rf) if store == nil { + checkerCounter.WithLabelValues("rule_checker", "no-store-add").Inc() return nil, errors.New("no store to add peer") } peer := &metapb.Peer{StoreId: store.GetID(), IsLearner: rf.Rule.Role == placement.Learner} @@ -123,7 +124,8 @@ func (c *RuleChecker) addRulePeer(region *core.RegionInfo, rf *placement.RuleFit func (c *RuleChecker) replaceRulePeer(region *core.RegionInfo, fit *placement.RegionFit, rf *placement.RuleFit, peer *metapb.Peer, status string) (*operator.Operator, error) { store := SelectStoreToReplacePeerByRule(c.name, c.cluster, region, fit, rf, peer) if store == nil { - return nil, errors.New("no store to add peer") + checkerCounter.WithLabelValues("rule_checker", "no-store-replace").Inc() + return nil, errors.New("no store to replace peer") } newPeer := &metapb.Peer{StoreId: store.GetID(), IsLearner: rf.Rule.Role == placement.Learner} return operator.CreateMovePeerOperator("replace-rule-"+status+"-peer", c.cluster, region, operator.OpReplica, peer.StoreId, newPeer) @@ -141,6 +143,7 @@ func (c *RuleChecker) fixLooseMatchPeer(region *core.RegionInfo, fit *placement. return operator.CreateTransferLeaderOperator("fix-peer-role", c.cluster, region, peer.GetStoreId(), p.GetStoreId(), 0) } } + checkerCounter.WithLabelValues("rule_checker", "no-new-leader").Inc() return nil, errors.New("no new leader") } return nil, nil