Skip to content

Commit

Permalink
fix: wait for infra machine info to be collected before powering off
Browse files Browse the repository at this point in the history
There were cases when an infra machine was accepted, it was powered off by the infra provider too quickly, before its specs were populated by the `MachineStatus` poller. This caused additional issues as some other resources like `SchematicConfiguration` were also never created, blocking cluster creation.

Address this by setting the preferred power state of the infra machine to ON until its status is populated (we check this by checking the secure boot status field).

Additionally, clean up `InfraMachineConfig` resources (user-managed) when a machine is deleted.

Signed-off-by: Utku Ozdemir <[email protected]>
  • Loading branch information
utkuozdemir committed Jan 8, 2025
1 parent 1c4f9af commit 728897a
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 6 deletions.
24 changes: 21 additions & 3 deletions internal/backend/runtime/omni/controllers/omni/infra_machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,9 @@ func NewInfraMachineController() *InfraMachineController {
qtransform.WithExtraMappedInput(
qtransform.MapperSameID[*omni.ClusterMachine, *siderolink.Link](),
),
qtransform.WithExtraMappedInput(
qtransform.MapperSameID[*omni.MachineStatus, *siderolink.Link](),
),
qtransform.WithExtraMappedInput(
func(ctx context.Context, _ *zap.Logger, runtime controller.QRuntime, res *infra.ProviderStatus) ([]resource.Pointer, error) {
linkList, err := safe.ReaderListAll[*siderolink.Link](ctx, runtime, state.WithLabelQuery(resource.LabelEqual(omni.LabelInfraProviderID, res.Metadata().ID())))
Expand Down Expand Up @@ -94,6 +97,11 @@ func (h *infraMachineControllerHelper) transformExtraOutput(ctx context.Context,
return err
}

machineStatus, err := helpers.HandleInput[*omni.MachineStatus](ctx, r, InfraMachineControllerName, link)
if err != nil {
return err
}

providerID, ok := link.Metadata().Annotations().Get(omni.LabelInfraProviderID)
if !ok {
return xerrors.NewTaggedf[qtransform.SkipReconcileTag]("the link is not created by an infra provider")
Expand All @@ -108,7 +116,9 @@ func (h *infraMachineControllerHelper) transformExtraOutput(ctx context.Context,
return xerrors.NewTaggedf[qtransform.SkipReconcileTag]("the link is not created by a static infra provider")
}

if err = h.applyInfraMachineConfig(infraMachine, config); err != nil {
machineInfoCollected := machineStatus != nil && machineStatus.TypedSpec().Value.SecureBootStatus != nil

if err = h.applyInfraMachineConfig(infraMachine, config, machineInfoCollected); err != nil {
return err
}

Expand Down Expand Up @@ -168,13 +178,17 @@ func (h *infraMachineControllerHelper) finalizerRemovalExtraOutput(ctx context.C
return err
}

_, err := helpers.HandleInput[*omni.ClusterMachine](ctx, r, InfraMachineControllerName, link)
if _, err := helpers.HandleInput[*omni.ClusterMachine](ctx, r, InfraMachineControllerName, link); err != nil {
return err
}

_, err := helpers.HandleInput[*omni.MachineStatus](ctx, r, InfraMachineControllerName, link)

return err
}

// applyInfraMachineConfig applies the user-managed configuration from the omni.InfraMachineConfig resource into the infra.Machine.
func (h *infraMachineControllerHelper) applyInfraMachineConfig(infraMachine *infra.Machine, config *omni.InfraMachineConfig) error {
func (h *infraMachineControllerHelper) applyInfraMachineConfig(infraMachine *infra.Machine, config *omni.InfraMachineConfig, machineInfoCollected bool) error {
const defaultPreferredPowerState = specs.InfraMachineSpec_POWER_STATE_OFF // todo: introduce a resource to configure this globally or per-provider level

// reset the user-override fields except the "Accepted" field
Expand Down Expand Up @@ -209,6 +223,10 @@ func (h *infraMachineControllerHelper) applyInfraMachineConfig(infraMachine *inf
infraMachine.Metadata().Labels().Delete(omni.LabelMachinePendingAccept)
}

if !machineInfoCollected { // we need the machine to stay powered on even if it is accepted, until Omni collects the machine information
infraMachine.TypedSpec().Value.PreferredPowerState = specs.InfraMachineSpec_POWER_STATE_ON
}

return nil
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,26 @@ func (suite *InfraMachineControllerSuite) TestReconcile() {
assertion.True(ok)
assertion.Equal("bare-metal", infraProviderID)

assertion.Equal(specs.InfraMachineSpec_POWER_STATE_OFF, r.TypedSpec().Value.PreferredPowerState)
assertion.Equal(specs.InfraMachineSpec_POWER_STATE_ON, r.TypedSpec().Value.PreferredPowerState) // MachineStatus is not populated yet
assertion.Equal(specs.InfraMachineConfigSpec_PENDING, r.TypedSpec().Value.AcceptanceStatus)
assertion.Empty(r.TypedSpec().Value.ClusterTalosVersion)
assertion.Empty(r.TypedSpec().Value.Extensions)
assertion.Empty(r.TypedSpec().Value.WipeId)
})

machineStatus := omni.NewMachineStatus(resources.DefaultNamespace, "machine-1")
machineStatus.TypedSpec().Value.SecureBootStatus = &specs.SecureBootStatus{}

suite.Require().NoError(suite.state.Create(suite.ctx, machineStatus))

assertResource[*omni.MachineStatus](&suite.OmniSuite, machineStatus.Metadata(), func(r *omni.MachineStatus, assertion *assert.Assertions) {
assertion.True(r.Metadata().Finalizers().Has(omnictrl.InfraMachineControllerName))
})

assertResource[*infra.Machine](&suite.OmniSuite, infraMachineMD, func(r *infra.Machine, assertion *assert.Assertions) {
assertion.Equal(specs.InfraMachineSpec_POWER_STATE_OFF, r.TypedSpec().Value.PreferredPowerState) // expect the default state of "OFF"
})

// accept the machine, set its preferred power state to on
config := omni.NewInfraMachineConfig(resources.DefaultNamespace, "machine-1")
config.TypedSpec().Value.AcceptanceStatus = specs.InfraMachineConfigSpec_ACCEPTED
Expand Down Expand Up @@ -148,6 +161,10 @@ func (suite *InfraMachineControllerSuite) TestReconcile() {
assertion.False(r.Metadata().Finalizers().Has(omnictrl.InfraMachineControllerName))
})

assertResource[*omni.MachineStatus](&suite.OmniSuite, infraMachineMD, func(r *omni.MachineStatus, assertion *assert.Assertions) {
assertion.False(r.Metadata().Finalizers().Has(omnictrl.InfraMachineControllerName))
})

// assert that infra.Machine is removed
assertNoResource[*infra.Machine](&suite.OmniSuite, infraMachine)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,11 @@ type MachineCleanupController = cleanup.Controller[*omni.Machine]
func NewMachineCleanupController() *MachineCleanupController {
return cleanup.NewController(
cleanup.Settings[*omni.Machine]{
Name: "MachineCleanupController",
Handler: &helpers.SameIDHandler[*omni.Machine, *omni.MachineSetNode]{},
Name: "MachineCleanupController",
Handler: cleanup.Combine(
&helpers.SameIDHandler[*omni.Machine, *omni.MachineSetNode]{},
&helpers.SameIDHandler[*omni.Machine, *omni.InfraMachineConfig]{},
),
},
)
}

0 comments on commit 728897a

Please sign in to comment.