Validation: treat as error if insufficient nodes
We switch to using the rolling update logic, which gives us nodes by InstanceGroup.
justinsb committed Mar 18, 2018
1 parent 98ba08f commit 1de1541
Showing 11 changed files with 406 additions and 865 deletions.
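
At the core of the commit, validation.ValidateCluster now takes the full cluster object instead of just its name, and returns (*ValidationCluster, error): a non-nil error means validation could not run at all, while a populated result.Failures means validation ran and found problems such as insufficient nodes. A minimal sketch of the new calling convention, pieced together from the call sites in this diff (the validateOrExit helper is illustrative, not kops code):

// Sketch of the new calling convention, inferred from the call sites in
// this diff; validateOrExit is a hypothetical helper, not part of kops.
package example

import (
	"fmt"
	"os"

	"k8s.io/client-go/kubernetes"
	api "k8s.io/kops/pkg/apis/kops"
	"k8s.io/kops/pkg/validation"
)

func validateOrExit(cluster *api.Cluster, list *api.InstanceGroupList, k8sClient kubernetes.Interface) {
	result, err := validation.ValidateCluster(cluster, list, k8sClient)
	if err != nil {
		// Hard error: validation itself could not run (e.g. API server unreachable).
		fmt.Fprintf(os.Stderr, "unexpected error during validation: %v\n", err)
		os.Exit(1)
	}
	if len(result.Failures) != 0 {
		// Validation ran but found problems (e.g. insufficient nodes);
		// exit non-zero even though no Go error was returned.
		os.Exit(2)
	}
}

This mirrors the exit-code split introduced in cmd/kops/validate_cluster.go below: hard errors and validation failures are now distinct outcomes.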
cmd/kops/rollingupdatecluster.go (2 changes: 1 addition & 1 deletion)
@@ -380,5 +380,5 @@ func RunRollingUpdateCluster(f *util.Factory, out io.Writer, options *RollingUpd
PostDrainDelay: options.PostDrainDelay,
ValidationTimeout: options.ValidationTimeout,
}
return d.RollingUpdate(groups, list)
return d.RollingUpdate(groups, cluster, list)
}
cmd/kops/validate_cluster.go (201 changes: 71 additions & 130 deletions)
@@ -33,8 +33,6 @@ import (
"k8s.io/client-go/tools/clientcmd"
"k8s.io/kops/cmd/kops/util"
api "k8s.io/kops/pkg/apis/kops"
apiutil "k8s.io/kops/pkg/apis/kops/util"
"k8s.io/kops/pkg/dns"
"k8s.io/kops/pkg/validation"
"k8s.io/kops/util/pkg/tables"
)
@@ -64,10 +62,15 @@ func NewCmdValidateCluster(f *util.Factory, out io.Writer) *cobra.Command {
Long: validateLong,
Example: validateExample,
Run: func(cmd *cobra.Command, args []string) {
err := RunValidateCluster(f, cmd, args, os.Stdout, options)
result, err := RunValidateCluster(f, cmd, args, os.Stdout, options)
if err != nil {
exitWithError(err)
}
// We want the validate command to exit non-zero if validation found a problem,
// even if we didn't really hit an error during validation.
if len(result.Failures) != 0 {
os.Exit(2)
}
},
}

@@ -76,25 +79,25 @@ func NewCmdValidateCluster(f *util.Factory, out io.Writer) *cobra.Command {
return cmd
}

func RunValidateCluster(f *util.Factory, cmd *cobra.Command, args []string, out io.Writer, options *ValidateClusterOptions) error {
func RunValidateCluster(f *util.Factory, cmd *cobra.Command, args []string, out io.Writer, options *ValidateClusterOptions) (*validation.ValidationCluster, error) {
err := rootCommand.ProcessArgs(args)
if err != nil {
return err
return nil, err
}

cluster, err := rootCommand.Cluster()
if err != nil {
return err
return nil, err
}

clientSet, err := f.Clientset()
if err != nil {
return err
return nil, err
}

list, err := clientSet.InstanceGroupsFor(cluster).List(metav1.ListOptions{})
if err != nil {
return fmt.Errorf("cannot get InstanceGroups for %q: %v", cluster.ObjectMeta.Name, err)
return nil, fmt.Errorf("cannot get InstanceGroups for %q: %v", cluster.ObjectMeta.Name, err)
}

if options.output == OutputTable {
@@ -108,7 +111,7 @@ func RunValidateCluster(f *util.Factory, cmd *cobra.Command, args []string, out
}

if len(instanceGroups) == 0 {
return fmt.Errorf("no InstanceGroup objects found\n")
return nil, fmt.Errorf("no InstanceGroup objects found")
}

// TODO: Refactor into util.Factory
@@ -117,70 +120,52 @@ func RunValidateCluster(f *util.Factory, cmd *cobra.Command, args []string, out
clientcmd.NewDefaultClientConfigLoadingRules(),
&clientcmd.ConfigOverrides{CurrentContext: contextName}).ClientConfig()
if err != nil {
return fmt.Errorf("Cannot load kubecfg settings for %q: %v\n", contextName, err)
return nil, fmt.Errorf("Cannot load kubecfg settings for %q: %v", contextName, err)
}

k8sClient, err := kubernetes.NewForConfig(config)
if err != nil {
return fmt.Errorf("Cannot build kube api client for %q: %v\n", contextName, err)
return nil, fmt.Errorf("Cannot build kubernetes api client for %q: %v", contextName, err)
}

// Do not use if we are running gossip
if !dns.IsGossipHostname(cluster.ObjectMeta.Name) {
// TODO we may want to return validation.ValidationCluster instead of building it later on
hasPlaceHolderIPAddress, err := validation.HasPlaceHolderIP(contextName)
if err != nil {
return err
}

if hasPlaceHolderIPAddress {
message := "Validation Failed\n\n" +
"The dns-controller Kubernetes deployment has not updated the Kubernetes cluster's API DNS entry to the correct IP address." +
" The API DNS IP address is the placeholder address that kops creates: 203.0.113.123." +
" Please wait about 5-10 minutes for a master to start, dns-controller to launch, and DNS to propagate." +
" The protokube container and dns-controller deployment logs may contain more diagnostic information." +
" Etcd and the API DNS entries must be updated for a kops Kubernetes cluster to start."
validationCluster := &validation.ValidationCluster{
ClusterName: cluster.ObjectMeta.Name,
ErrorMessage: message,
Status: validation.ClusterValidationFailed,
}
validationFailed := fmt.Errorf("\nCannot reach cluster's API server: unable to Validate Cluster: %s", cluster.ObjectMeta.Name)
switch options.output {
case OutputTable:
fmt.Println(message)
return validationFailed
case OutputYaml:
return validateClusterOutputYAML(validationCluster, validationFailed, out)
case OutputJSON:
return validateClusterOutputJSON(validationCluster, validationFailed, out)
default:
return fmt.Errorf("Unknown output format: %q", options.output)
}

}
}

validationCluster, validationFailed := validation.ValidateCluster(cluster.ObjectMeta.Name, list, k8sClient)

if validationCluster == nil || validationCluster.NodeList == nil || validationCluster.NodeList.Items == nil {
return validationFailed
result, err := validation.ValidateCluster(cluster, list, k8sClient)
if err != nil {
return nil, fmt.Errorf("unexpected error during validation: %v", err)
}

switch options.output {
case OutputTable:
return validateClusterOutputTable(validationCluster, validationFailed, instanceGroups, out)
if err := validateClusterOutputTable(result, cluster, instanceGroups, out); err != nil {
return nil, err
}

case OutputYaml:
return validateClusterOutputYAML(validationCluster, validationFailed, out)
y, err := yaml.Marshal(result)
if err != nil {
return nil, fmt.Errorf("unable to marshal YAML: %v", err)
}
if _, err := out.Write(y); err != nil {
return nil, fmt.Errorf("error writing to output: %v", err)
}

case OutputJSON:
return validateClusterOutputJSON(validationCluster, validationFailed, out)
j, err := json.Marshal(result)
if err != nil {
return nil, fmt.Errorf("unable to marshall JSON: %v", err)
}
if _, err := out.Write(j); err != nil {
return nil, fmt.Errorf("error writing JSON: %v", err)
}

default:
return fmt.Errorf("Unknown output format: %q", options.output)
return nil, fmt.Errorf("Unknown output format: %q", options.output)
}

return result, nil

}

func validateClusterOutputTable(validationCluster *validation.ValidationCluster, validationFailed error, instanceGroups []api.InstanceGroup, out io.Writer) error {
func validateClusterOutputTable(result *validation.ValidationCluster, cluster *api.Cluster, instanceGroups []api.InstanceGroup, out io.Writer) error {
t := &tables.Table{}
t.AddColumn("NAME", func(c api.InstanceGroup) string {
return c.ObjectMeta.Name
@@ -205,96 +190,52 @@ func validateClusterOutputTable(validationCluster *validation.ValidationCluster,
err := t.Render(instanceGroups, out, "NAME", "ROLE", "MACHINETYPE", "MIN", "MAX", "SUBNETS")

if err != nil {
return fmt.Errorf("cannot render nodes for %q: %v", validationCluster.ClusterName, err)
return fmt.Errorf("cannot render nodes for %q: %v", cluster.Name, err)
}

nodeTable := &tables.Table{}

nodeTable.AddColumn("NAME", func(n v1.Node) string {
return n.Name
})

nodeTable.AddColumn("READY", func(n v1.Node) v1.ConditionStatus {
return validation.GetNodeConditionStatus(&n)
})

nodeTable.AddColumn("ROLE", func(n v1.Node) string {
// TODO: Maybe print the instance group role instead?
// TODO: Maybe include the instance group name?
role := apiutil.GetNodeRole(&n)
if role == "" {
role = "node"
}
return role
})

fmt.Fprintln(out, "\nNODE STATUS")
err = nodeTable.Render(validationCluster.NodeList.Items, out, "NAME", "ROLE", "READY")

if err != nil {
return fmt.Errorf("cannot render nodes for %q: %v", validationCluster.ClusterName, err)
}
{
nodeTable := &tables.Table{}
nodeTable.AddColumn("NAME", func(n *validation.ValidationNode) string {
return n.Name
})

if len(validationCluster.ComponentFailures) != 0 {
componentFailuresTable := &tables.Table{}
componentFailuresTable.AddColumn("NAME", func(s string) string {
return s
nodeTable.AddColumn("READY", func(n *validation.ValidationNode) v1.ConditionStatus {
return n.Status
})

fmt.Fprintln(out, "\nComponent Failures")
err = componentFailuresTable.Render(validationCluster.ComponentFailures, out, "NAME")
nodeTable.AddColumn("ROLE", func(n *validation.ValidationNode) string {
return n.Role
})

if err != nil {
return fmt.Errorf("cannot render components for %q: %v", validationCluster.ClusterName, err)
fmt.Fprintln(out, "\nNODE STATUS")
if err := nodeTable.Render(result.Nodes, out, "NAME", "ROLE", "READY"); err != nil {
return fmt.Errorf("cannot render nodes for %q: %v", cluster.Name, err)
}
}

if len(validationCluster.PodFailures) != 0 {
podFailuresTable := &tables.Table{}
podFailuresTable.AddColumn("NAME", func(s string) string {
return s
if len(result.Failures) != 0 {
failuresTable := &tables.Table{}
failuresTable.AddColumn("KIND", func(e *validation.ValidationError) string {
return e.Kind
})
failuresTable.AddColumn("NAME", func(e *validation.ValidationError) string {
return e.Name
})
failuresTable.AddColumn("MESSAGE", func(e *validation.ValidationError) string {
return e.Message
})

fmt.Fprintln(out, "\nPod Failures in kube-system")
err = podFailuresTable.Render(validationCluster.PodFailures, out, "NAME")

if err != nil {
return fmt.Errorf("cannot render pods for %q: %v", validationCluster.ClusterName, err)
fmt.Fprintln(out, "\nVALIDATION ERRORS")
if err := failuresTable.Render(result.Failures, out, "KIND", "NAME", "MESSAGE"); err != nil {
return fmt.Errorf("error rendering failures table: %v", err)
}
}

if validationFailed == nil {
fmt.Fprintf(out, "\nYour cluster %s is ready\n", validationCluster.ClusterName)
return nil
if len(result.Failures) == 0 {
fmt.Fprintf(out, "\nYour cluster %s is ready\n", cluster.Name)
} else {
// do we need to print which instance group is not ready?
// nodes are going to be a pain
fmt.Fprint(out, "\nValidation Failed\n")
fmt.Fprintf(out, "Ready Master(s) %d out of %d.\n", len(validationCluster.MastersReadyArray), validationCluster.MastersCount)
fmt.Fprintf(out, "Ready Node(s) %d out of %d.\n", len(validationCluster.NodesReadyArray), validationCluster.NodesCount)
return validationFailed
}
}

func validateClusterOutputYAML(validationCluster *validation.ValidationCluster, validationFailed error, out io.Writer) error {
y, err := yaml.Marshal(validationCluster)
if err != nil {
return fmt.Errorf("unable to marshall YAML: %v\n", err)
}
return validateOutput(y, validationFailed, out)
}

func validateClusterOutputJSON(validationCluster *validation.ValidationCluster, validationFailed error, out io.Writer) error {
j, err := json.Marshal(validationCluster)
if err != nil {
return fmt.Errorf("unable to marshall JSON: %v\n", err)
}
return validateOutput(j, validationFailed, out)
}

func validateOutput(b []byte, validationFailed error, out io.Writer) error {
if _, err := out.Write(b); err != nil {
return fmt.Errorf("unable to print data: %v\n", err)
}
return validationFailed
return nil
}
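
The rendering code above reads result.Nodes and result.Failures, which implies roughly the following shape for the new result types. This is a reconstruction from usage only, under the assumption that no other fields are involved; the authoritative definitions live in k8s.io/kops/pkg/validation:

// Assumed shape of the validation result, reconstructed from how
// validate_cluster.go consumes it above; not copied from pkg/validation.
package validation

import v1 "k8s.io/api/core/v1"

type ValidationCluster struct {
	Failures []*ValidationError // problems found, e.g. insufficient nodes
	Nodes    []*ValidationNode  // per-node status, rendered as NODE STATUS
}

type ValidationError struct {
	Kind    string // what failed, rendered in the KIND column
	Name    string
	Message string
}

type ValidationNode struct {
	Name   string
	Role   string
	Status v1.ConditionStatus // rendered in the READY column
}

Replacing the old ClusterName/ErrorMessage/NodeList fields with a flat Failures list is presumably what lets insufficient nodes be reported as an ordinary validation failure rather than a special case.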
pkg/instancegroups/instancegroups.go (23 changes: 11 additions & 12 deletions)
@@ -97,7 +97,7 @@ func promptInteractive(upgradedHost string) (stopPrompting bool, err error) {
// TODO: Batch termination, like a rolling-update

// RollingUpdate performs a rolling update on a list of ec2 instances.
func (r *RollingUpdateInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, isBastion bool, sleepAfterTerminate time.Duration, validationTimeout time.Duration) (err error) {
func (r *RollingUpdateInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpdateCluster, cluster *api.Cluster, instanceGroupList *api.InstanceGroupList, isBastion bool, sleepAfterTerminate time.Duration, validationTimeout time.Duration) (err error) {

// we should not get here, but hey I am going to check.
if rollingUpdateData == nil {
@@ -127,7 +127,7 @@ func (r *RollingUpdateInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpd
} else if rollingUpdateData.CloudOnly {
glog.V(3).Info("Not validating cluster as validation is turned off via the cloud-only flag.")
} else if featureflag.DrainAndValidateRollingUpdate.Enabled() {
if err = r.ValidateCluster(rollingUpdateData, instanceGroupList); err != nil {
if err = r.ValidateCluster(rollingUpdateData, cluster, instanceGroupList); err != nil {
if rollingUpdateData.FailOnValidate {
return fmt.Errorf("error validating cluster: %v", err)
} else {
Expand Down Expand Up @@ -187,7 +187,7 @@ func (r *RollingUpdateInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpd
} else if featureflag.DrainAndValidateRollingUpdate.Enabled() {
glog.Infof("Validating the cluster.")

if err = r.ValidateClusterWithDuration(rollingUpdateData, instanceGroupList, validationTimeout); err != nil {
if err = r.ValidateClusterWithDuration(rollingUpdateData, cluster, instanceGroupList, validationTimeout); err != nil {

if rollingUpdateData.FailOnValidate {
glog.Errorf("Cluster did not validate within %s", validationTimeout)
@@ -213,12 +213,12 @@ func (r *RollingUpdateInstanceGroup) RollingUpdate(rollingUpdateData *RollingUpd
}

// ValidateClusterWithDuration runs validation.ValidateCluster until either we get positive result or the timeout expires
func (r *RollingUpdateInstanceGroup) ValidateClusterWithDuration(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, duration time.Duration) error {
func (r *RollingUpdateInstanceGroup) ValidateClusterWithDuration(rollingUpdateData *RollingUpdateCluster, cluster *api.Cluster, instanceGroupList *api.InstanceGroupList, duration time.Duration) error {
// TODO should we expose this to the UI?
tickDuration := 30 * time.Second
// Try to validate cluster at least once, this will handle durations that are lower
// than our tick time
if r.tryValidateCluster(rollingUpdateData, instanceGroupList, duration, tickDuration) {
if r.tryValidateCluster(rollingUpdateData, cluster, instanceGroupList, duration, tickDuration) {
return nil
}

@@ -232,7 +232,7 @@ func (r *RollingUpdateInstanceGroup) ValidateClusterWithDuration(rollingUpdateDa
return fmt.Errorf("cluster did not validate within a duation of %q", duration)
case <-tick:
// Got a tick, validate cluster
if r.tryValidateCluster(rollingUpdateData, instanceGroupList, duration, tickDuration) {
if r.tryValidateCluster(rollingUpdateData, cluster, instanceGroupList, duration, tickDuration) {
return nil
}
// ValidateCluster didn't work yet, so let's try again
@@ -241,8 +241,8 @@ func (r *RollingUpdateInstanceGroup) ValidateClusterWithDuration(rollingUpdateDa
}
}

func (r *RollingUpdateInstanceGroup) tryValidateCluster(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList, duration time.Duration, tickDuration time.Duration) bool {
if _, err := validation.ValidateCluster(rollingUpdateData.ClusterName, instanceGroupList, rollingUpdateData.K8sClient); err != nil {
func (r *RollingUpdateInstanceGroup) tryValidateCluster(rollingUpdateData *RollingUpdateCluster, cluster *api.Cluster, instanceGroupList *api.InstanceGroupList, duration time.Duration, tickDuration time.Duration) bool {
if _, err := validation.ValidateCluster(cluster, instanceGroupList, rollingUpdateData.K8sClient); err != nil {
glog.Infof("Cluster did not validate, will try again in %q until duration %q expires: %v.", tickDuration, duration, err)
return false
} else {
@@ -252,10 +252,9 @@ func (r *RollingUpdateInstanceGroup) tryValidateCluster(rollingUpdateData *Rolli
}

// ValidateCluster runs our validation methods on the K8s Cluster.
func (r *RollingUpdateInstanceGroup) ValidateCluster(rollingUpdateData *RollingUpdateCluster, instanceGroupList *api.InstanceGroupList) error {

if _, err := validation.ValidateCluster(rollingUpdateData.ClusterName, instanceGroupList, rollingUpdateData.K8sClient); err != nil {
return fmt.Errorf("cluster %q did not pass validation: %v", rollingUpdateData.ClusterName, err)
func (r *RollingUpdateInstanceGroup) ValidateCluster(rollingUpdateData *RollingUpdateCluster, cluster *api.Cluster, instanceGroupList *api.InstanceGroupList) error {
if _, err := validation.ValidateCluster(cluster, instanceGroupList, rollingUpdateData.K8sClient); err != nil {
return fmt.Errorf("cluster %q did not pass validation: %v", cluster.Name, err)
}

return nil
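
ValidateClusterWithDuration above is a try-once-then-poll loop: validate immediately, so timeouts shorter than the tick still get one attempt, then retry every 30 seconds until the deadline passes. The same pattern in isolation, as a generic sketch (pollUntil and tryOnce are illustrative names, not kops API):

// Generic form of the poll-until-deadline loop used above; pollUntil and
// tryOnce are illustrative names, not part of kops.
package example

import (
	"fmt"
	"time"
)

func pollUntil(deadline, tick time.Duration, tryOnce func() bool) error {
	// Try at least once, so deadlines shorter than the tick get a chance.
	if tryOnce() {
		return nil
	}
	timeout := time.After(deadline)
	ticker := time.NewTicker(tick)
	defer ticker.Stop()
	for {
		select {
		case <-timeout:
			return fmt.Errorf("did not succeed within %v", deadline)
		case <-ticker.C:
			// Got a tick; try again.
			if tryOnce() {
				return nil
			}
		}
	}
}

In the rolling-update code this pattern wraps tryValidateCluster, so a cluster that fails validation right after a node is replaced keeps being re-checked until ValidationTimeout expires.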
(8 more changed files not shown)
