diff --git a/tests/cmd/stability/main.go b/tests/cmd/stability/main.go
index e26177b88d1..199a486bb60 100644
--- a/tests/cmd/stability/main.go
+++ b/tests/cmd/stability/main.go
@@ -39,6 +39,8 @@ func main() {
 	conf := tests.ParseConfigOrDie()
 	cli, kubeCli := client.NewCliOrDie()
 	oa := tests.NewOperatorActions(cli, kubeCli, conf)
+	fta := tests.NewFaultTriggerAction(cli, kubeCli, conf)
+	fta.CheckAndRecoverEnvOrDie()
 
 	tidbVersion := conf.GetTiDBVersionOrDie()
 	upgardeTiDBVersions := conf.GetUpgradeTidbVersionsOrDie()
@@ -203,7 +205,6 @@ func main() {
 	backup.NewBackupCase(oa, clusterBackupFrom, clusterRestoreTo).RunOrDie()
 
 	// stop a node and failover automatically
-	fta := tests.NewFaultTriggerAction(cli, kubeCli, conf)
 	physicalNode, node, faultTime := fta.StopNodeOrDie()
 	oa.CheckFailoverPendingOrDie(allClusters, &faultTime)
 	oa.CheckFailoverOrDie(allClusters, node)
diff --git a/tests/fault.go b/tests/fault.go
index 4f8e75bea9b..0f97efdf26d 100644
--- a/tests/fault.go
+++ b/tests/fault.go
@@ -23,6 +23,8 @@ const (
 )
 
 type FaultTriggerActions interface {
+	CheckAndRecoverEnv() error
+	CheckAndRecoverEnvOrDie()
 	StopNode() (string, string, time.Time, error)
 	StopNodeOrDie() (string, string, time.Time)
 	StartNode(physicalNode string, node string) error
@@ -62,6 +64,58 @@ type faultTriggerActions struct {
 	cfg *Config
 }
 
+// CheckAndRecoverEnv brings the test environment back to a running state. In
+// order, it calls StartNode for every node in fa.cfg.Nodes, StartETCD,
+// StartKubelet for every node in fa.cfg.Nodes, and then StartKubeAPIServer,
+// StartKubeControllerManager and StartKubeScheduler for every node in
+// fa.cfg.APIServers. It stops at, and returns, the first error encountered.
+func (fa *faultTriggerActions) CheckAndRecoverEnv() error {
+	glog.Infof("ensure all nodes are running")
+	for _, physicalNode := range fa.cfg.Nodes {
+		for _, vNode := range physicalNode.Nodes {
+			if err := fa.StartNode(physicalNode.PhysicalNode, vNode); err != nil {
+				return err
+			}
+		}
+	}
+	glog.Infof("ensure all etcds are running")
+	if err := fa.StartETCD(); err != nil {
+		return err
+	}
+	glog.Infof("ensure all kubelets are running")
+	for _, physicalNode := range fa.cfg.Nodes {
+		for _, vNode := range physicalNode.Nodes {
+			if err := fa.StartKubelet(vNode); err != nil {
+				return err
+			}
+		}
+	}
+	glog.Infof("ensure all static pods are running")
+	for _, physicalNode := range fa.cfg.APIServers {
+		for _, vNode := range physicalNode.Nodes {
+			if err := fa.StartKubeAPIServer(vNode); err != nil {
+				return err
+			}
+			if err := fa.StartKubeControllerManager(vNode); err != nil {
+				return err
+			}
+			if err := fa.StartKubeScheduler(vNode); err != nil {
+				return err
+			}
+		}
+	}
+
+	return nil
+}
+
+// CheckAndRecoverEnvOrDie runs CheckAndRecoverEnv and terminates the process
+// via glog.Fatal if it returns an error.
+func (fa *faultTriggerActions) CheckAndRecoverEnvOrDie() {
+	if err := fa.CheckAndRecoverEnv(); err != nil {
+		glog.Fatal(err)
+	}
+}
+
 func (fa *faultTriggerActions) StopNode() (string, string, time.Time, error) {
 	now := time.Now()
 	node, err := getFaultNode(fa.kubeCli)
diff --git a/tests/pkg/fault-trigger/manager/static_pod_service.go b/tests/pkg/fault-trigger/manager/static_pod_service.go
index f30c38e6d1b..b58e47cf89d 100644
--- a/tests/pkg/fault-trigger/manager/static_pod_service.go
+++ b/tests/pkg/fault-trigger/manager/static_pod_service.go
@@ -15,6 +15,7 @@ package manager
 
 import (
 	"fmt"
+	"os"
 	"os/exec"
 
 	"github.com/golang/glog"
@@ -74,6 +75,10 @@ func (m *Manager) StopKubeControllerManager() error {
 
 func (m *Manager) stopStaticPodService(serviceName string, fileName string) error {
 	maniest := fmt.Sprintf("%s/%s", staticPodPath, fileName)
+	if _, err := os.Stat(maniest); os.IsNotExist(err) {
+		glog.Infof("%s had been stopped before", serviceName)
+		return nil
+	}
 	shell := fmt.Sprintf("mkdir -p %s && mv %s %s", staticPodTmpPath, maniest, staticPodTmpPath)
 
 	cmd := exec.Command("/bin/sh", "-c", shell)
@@ -90,6 +95,10 @@ func (m *Manager) stopStaticPodService(serviceName string, fileName string) erro
 
 func (m *Manager) startStaticPodService(serviceName string, fileName string) error {
 	maniest := fmt.Sprintf("%s/%s", staticPodTmpPath, fileName)
+	if _, err := os.Stat(maniest); os.IsNotExist(err) {
+		glog.Infof("%s had been started before", serviceName)
+		return nil
+	}
 	shell := fmt.Sprintf("mv %s %s", maniest, staticPodPath)
 
 	cmd := exec.Command("/bin/sh", "-c", shell)