-
Notifications
You must be signed in to change notification settings - Fork 165
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Kubevirt: Defer eve reboot/shutdown/update until drain completes
As part of kubevirt-eve, there are multiple cluster nodes, each hosting app workloads and volume replicas. This commit defers EVE management operations that would result in unavailability of storage replicas. An example: 1. Node 1 has an outage and recovers. 2. Before volumes finish rebuilding on node 1, node 2 has an outage and recovers. 3. Volumes begin rebuilding replicas on nodes 1 and 2. 4. A user initiates a request to reboot/shutdown/update eve-os on node 3. 5. That config request is deferred until the replicas are rebuilt on the other nodes. Signed-off-by: Andrew Durbin <[email protected]>
- Loading branch information
1 parent
0769a20
commit 432f137
Showing
17 changed files
with
989 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
// Copyright (c) 2024 Zededa, Inc. | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
package baseosmgr | ||
|
||
import ( | ||
"github.com/lf-edge/eve/pkg/pillar/kubeapi" | ||
"github.com/lf-edge/eve/pkg/pillar/pubsub" | ||
"github.com/lf-edge/eve/pkg/pillar/types" | ||
) | ||
|
||
func handleNodeDrainStatusCreate(ctxArg interface{}, key string, | ||
configArg interface{}) { | ||
handleNodeDrainStatusImpl(ctxArg, key, configArg, nil) | ||
} | ||
|
||
func handleNodeDrainStatusModify(ctxArg interface{}, key string, | ||
configArg interface{}, oldConfigArg interface{}) { | ||
handleNodeDrainStatusImpl(ctxArg, key, configArg, oldConfigArg) | ||
} | ||
|
||
func handleNodeDrainStatusImpl(ctxArg interface{}, _ string, | ||
configArg interface{}, _ interface{}) { | ||
newStatus, ok := configArg.(kubeapi.NodeDrainStatus) | ||
if !ok { | ||
log.Fatalf("handleNodeDrainStatusImpl invalid type in configArg: %v", configArg) | ||
} | ||
ctx, ok := ctxArg.(*baseOsMgrContext) | ||
if !ok { | ||
log.Fatalf("handleNodeDrainStatusImpl invalid type in ctxArg: %v", ctxArg) | ||
} | ||
|
||
if newStatus.RequestedBy != kubeapi.UPDATE { | ||
return | ||
} | ||
|
||
log.Functionf("handleNodeDrainStatusImpl to:%v", newStatus) | ||
if (newStatus.Status == kubeapi.FAILEDCORDON) || | ||
(newStatus.Status == kubeapi.FAILEDDRAIN) { | ||
log.Errorf("handleNodeDrainStatusImpl nodedrain-step:drain-failed-handler unpublish NodeDrainRequest due to NodeDrainStatus:%v", newStatus) | ||
if err := ctx.pubNodeDrainRequest.Unpublish("global"); err != nil { | ||
log.Errorf("Unable to remove NodeDrainRequest object:%v", err) | ||
} | ||
} | ||
if newStatus.Status == kubeapi.COMPLETE { | ||
id := ctx.deferredBaseOsID | ||
if id != "" { | ||
log.Noticef("handleNodeDrainStatusImpl nodedrain-step:drain-complete-handler, continuing baseosstatus update id:%s", id) | ||
baseOsHandleStatusUpdateUUID(ctx, id) | ||
} | ||
} | ||
} | ||
|
||
func handleNodeDrainStatusDelete(_ interface{}, _ string, | ||
_ interface{}) { | ||
log.Function("handleNodeDrainStatusDelete") | ||
} | ||
|
||
func initNodeDrainPubSub(ps *pubsub.PubSub, ctx *baseOsMgrContext) { | ||
subNodeDrainStatus, err := ps.NewSubscription(pubsub.SubscriptionOptions{ | ||
AgentName: "zedkube", | ||
MyAgentName: agentName, | ||
TopicImpl: kubeapi.NodeDrainStatus{}, | ||
Persistent: false, | ||
Activate: false, | ||
Ctx: ctx, | ||
CreateHandler: handleNodeDrainStatusCreate, | ||
ModifyHandler: handleNodeDrainStatusModify, | ||
DeleteHandler: handleNodeDrainStatusDelete, | ||
WarningTime: warningTime, | ||
ErrorTime: errorTime, | ||
}) | ||
if err != nil { | ||
log.Fatalf("initNodeDrainPubSub subNodeDrainStatus err:%v", err) | ||
return | ||
} | ||
if err := subNodeDrainStatus.Activate(); err != nil { | ||
log.Fatalf("initNodeDrainPubSub can't activate sub:%v", err) | ||
} | ||
|
||
pubNodeDrainRequest, err := ps.NewPublication( | ||
pubsub.PublicationOptions{ | ||
AgentName: agentName, | ||
TopicType: kubeapi.NodeDrainRequest{}, | ||
}) | ||
if err != nil { | ||
log.Fatalf("initNodeDrainPubSub pubNodeDrainRequest err:%v", err) | ||
return | ||
} | ||
ctx.subNodeDrainStatus = subNodeDrainStatus | ||
ctx.pubNodeDrainRequest = pubNodeDrainRequest | ||
} | ||
|
||
// shouldDeferForNodeDrain will return true if this BaseOsStatus update will be handled later | ||
func shouldDeferForNodeDrain(ctx *baseOsMgrContext, id string, config *types.BaseOsConfig, status *types.BaseOsStatus) bool { | ||
drainStatus := kubeapi.GetNodeDrainStatus(ctx.subNodeDrainStatus, log) | ||
if drainStatus.Status == kubeapi.UNKNOWN { | ||
log.Error("shouldDeferForNodeDrain EARLY boot request, zedkube not up yet") | ||
return false | ||
} | ||
|
||
log.Noticef("shouldDeferForNodeDrain drainCheck id:%s state:%d baseOsConfig:%v baseOsStatus:%v drainStatus:%d", | ||
id, status.State, config, status, drainStatus.Status) | ||
if drainStatus.Status == kubeapi.NOTREQUESTED { | ||
ctx.deferredBaseOsID = id | ||
log.Noticef("shouldDeferForNodeDrain nodedrain-step:request requester:eve-os-update ctx:%s", id) | ||
err := kubeapi.RequestNodeDrain(ctx.pubNodeDrainRequest, kubeapi.UPDATE, id) | ||
if err != nil { | ||
log.Errorf("shouldDeferForNodeDrain: can't request node drain: %v", err) | ||
} | ||
return true | ||
} | ||
if drainStatus.Status == kubeapi.REQUESTED || | ||
drainStatus.Status == kubeapi.STARTING || | ||
drainStatus.Status == kubeapi.CORDONED || | ||
drainStatus.Status == kubeapi.FAILEDCORDON || | ||
drainStatus.Status == kubeapi.DRAINRETRYING || | ||
drainStatus.Status == kubeapi.FAILEDDRAIN { | ||
log.Functionf("shouldDeferForNodeDrain drain in-progress or in error, still defer") | ||
return true | ||
} | ||
|
||
if drainStatus.Status != kubeapi.COMPLETE { | ||
log.Errorf("shouldDeferForNodeDrain unhanded NodeDrainStatus:%v", drainStatus) | ||
} | ||
log.Noticef("shouldDeferForNodeDrain nodedrain-step:handle-complete requester:eve-os-update ctx:%s", id) | ||
return false | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
package diag | ||
|
||
import ( | ||
"time" | ||
|
||
"github.com/lf-edge/eve/pkg/pillar/kubeapi" | ||
"github.com/lf-edge/eve/pkg/pillar/pubsub" | ||
) | ||
|
||
func initDrainSub(ps *pubsub.PubSub, ctx *diagContext) { | ||
subNodeDrainStatus, err := ps.NewSubscription(pubsub.SubscriptionOptions{ | ||
AgentName: "zedkube", | ||
MyAgentName: agentName, | ||
TopicImpl: kubeapi.NodeDrainStatus{}, | ||
Persistent: false, | ||
Activate: true, | ||
Ctx: ctx, | ||
CreateHandler: handleNodeDrainStatusCreate, | ||
ModifyHandler: handleNodeDrainStatusModify, | ||
DeleteHandler: handleNodeDrainStatusDelete, | ||
WarningTime: warningTime, | ||
ErrorTime: errorTime, | ||
}) | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
ctx.subNodeDrainStatus = subNodeDrainStatus | ||
ctx.subNodeDrainStatus.Activate() | ||
} | ||
|
||
func handleNodeDrainStatusCreate(ctxArg interface{}, key string, | ||
configArg interface{}) { | ||
handleNodeDrainStatusImpl(ctxArg, key, configArg, nil) | ||
} | ||
|
||
func handleNodeDrainStatusModify(ctxArg interface{}, key string, | ||
configArg interface{}, oldConfigArg interface{}) { | ||
handleNodeDrainStatusImpl(ctxArg, key, configArg, oldConfigArg) | ||
} | ||
|
||
func handleNodeDrainStatusImpl(ctxArg interface{}, key string, | ||
configArg interface{}, oldConfigArg interface{}) { | ||
ctx := ctxArg.(*diagContext) | ||
newStatus := configArg.(kubeapi.NodeDrainStatus) | ||
printNodeDrainStatus(ctx, newStatus) | ||
} | ||
|
||
func printNodeDrainStatus(ctx *diagContext, newStatus kubeapi.NodeDrainStatus) { | ||
ts := time.Now().Format(time.RFC3339Nano) | ||
if newStatus.Status < kubeapi.REQUESTED { | ||
// Just print the transitions which are linked to lengthy operations or errors | ||
return | ||
} | ||
ctx.ph.Print("INFO: Node Drain -> %s at %v\n", newStatus.Status.String(), ts) | ||
ctx.ph.Flush() | ||
} | ||
|
||
// handleNodeDrainStatusDelete is registered as the DeleteHandler of diag's
// NodeDrainStatus subscription. Deletions need no action, so it is a no-op.
func handleNodeDrainStatusDelete(_ interface{}, _ string, _ interface{}) {
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
package nodeagent | ||
|
||
import ( | ||
"github.com/lf-edge/eve/pkg/pillar/kubeapi" | ||
"github.com/lf-edge/eve/pkg/pillar/pubsub" | ||
) | ||
|
||
func handleNodeDrainStatusCreateNA(ctxArg interface{}, key string, | ||
configArg interface{}) { | ||
handleNodeDrainStatusImplNA(ctxArg, key, configArg, nil) | ||
} | ||
|
||
func handleNodeDrainStatusModifyNA(ctxArg interface{}, key string, | ||
configArg interface{}, oldConfigArg interface{}) { | ||
handleNodeDrainStatusImplNA(ctxArg, key, configArg, oldConfigArg) | ||
} | ||
|
||
func handleNodeDrainStatusImplNA(ctxArg interface{}, _ string, | ||
configArg interface{}, _ interface{}) { | ||
ctx, ok := ctxArg.(*nodeagentContext) | ||
if !ok { | ||
log.Fatalf("handleNodeDrainStatusImplNA invalid type in ctxArg:%v", ctxArg) | ||
} | ||
newStatus, ok := configArg.(kubeapi.NodeDrainStatus) | ||
if !ok { | ||
log.Fatalf("handleNodeDrainStatusImplNA invalid type in configArg:%v", configArg) | ||
} | ||
|
||
if newStatus.RequestedBy != kubeapi.DEVICEOP { | ||
return | ||
} | ||
|
||
log.Noticef("handleNodeDrainStatusImplNA to:%v", newStatus) | ||
// NodeDrainStatus Failures here should keep drainInProgress set. | ||
// As this will set DrainInProgress on NodeAgentStatus and keep zedagent from allowing | ||
// the deferred operation to continue. | ||
if (newStatus.Status >= kubeapi.REQUESTED) && (newStatus.Status < kubeapi.COMPLETE) { | ||
log.Noticef("handleNodeDrainStatusImplNA nodedrain-step:drain-inprogress-handler NodeDrainStatus:%v", newStatus) | ||
ctx.drainInProgress = true | ||
publishNodeAgentStatus(ctx) | ||
} | ||
if newStatus.Status == kubeapi.COMPLETE { | ||
log.Notice("handleNodeDrainStatusImplNA nodedrain-step:drain-complete-handler notify zedagent") | ||
ctx.drainInProgress = false | ||
publishNodeAgentStatus(ctx) | ||
} | ||
} | ||
|
||
func handleNodeDrainStatusDeleteNA(_ interface{}, _ string, | ||
_ interface{}) { | ||
log.Functionf("handleNodeDrainStatusDeleteNA") | ||
} | ||
|
||
func initNodeDrainPubSub(ps *pubsub.PubSub, ctx *nodeagentContext) { | ||
subNodeDrainStatus, err := ps.NewSubscription(pubsub.SubscriptionOptions{ | ||
AgentName: "zedkube", | ||
MyAgentName: agentName, | ||
TopicImpl: kubeapi.NodeDrainStatus{}, | ||
Persistent: false, | ||
Activate: false, | ||
Ctx: ctx, | ||
CreateHandler: handleNodeDrainStatusCreateNA, | ||
ModifyHandler: handleNodeDrainStatusModifyNA, | ||
DeleteHandler: handleNodeDrainStatusDeleteNA, | ||
WarningTime: warningTime, | ||
ErrorTime: errorTime, | ||
}) | ||
if err != nil { | ||
log.Fatalf("initNodeDrainPubSub subNodeDrainStatus err:%v", err) | ||
return | ||
} | ||
if err := subNodeDrainStatus.Activate(); err != nil { | ||
log.Fatalf("initNodeDrainPubSub activate err:%v", err) | ||
} | ||
ctx.subNodeDrainStatus = subNodeDrainStatus | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.