Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Retry with ForcePowerOff if graceful shutdown times out #129

Merged
merged 1 commit into from
Sep 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,15 @@ RUN --mount=type=cache,target=/root/.cache/go-build \
# Use distroless as minimal base image to package the manager binary
# Refer to https://github.com/GoogleContainerTools/distroless for more details
FROM gcr.io/distroless/static:nonroot AS manager
LABEL source_repository="https://github.com/ironcore-dev/metal-operator"
WORKDIR /
COPY --from=builder /workspace/manager .
USER 65532:65532

ENTRYPOINT ["/manager"]

FROM gcr.io/distroless/static:nonroot AS probe
LABEL source_repository="https://github.com/ironcore-dev/metal-operator"
WORKDIR /
COPY --from=builder /workspace/metalprobe .
USER 65532:65532
Expand Down
5 changes: 4 additions & 1 deletion bmc/bmc.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,12 @@ type BMC interface {
// PowerOn powers on the system.
PowerOn(systemUUID string) error

// PowerOff powers off the system.
// PowerOff gracefully shuts down the system.
PowerOff(systemUUID string) error

// ForcePowerOff immediately powers off the system without waiting for a graceful shutdown.
ForcePowerOff(systemUUID string) error

// Reset performs a reset on the system.
Reset(systemUUID string, resetType redfish.ResetType) error

Expand Down
14 changes: 13 additions & 1 deletion bmc/redfish.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ func (r *RedfishBMC) PowerOn(systemUUID string) error {
return nil
}

// PowerOff powers off the system using Redfish.
// PowerOff gracefully shuts down the system using Redfish.
func (r *RedfishBMC) PowerOff(systemUUID string) error {
system, err := r.getSystemByUUID(systemUUID)
if err != nil {
Expand All @@ -86,6 +86,18 @@ func (r *RedfishBMC) PowerOff(systemUUID string) error {
return nil
}

// ForcePowerOff immediately powers off the system using Redfish.
//
// Unlike PowerOff, which requests a graceful shutdown, this issues a
// ForceOff reset and does not wait for the operating system to shut down.
// It returns an error if the system identified by systemUUID cannot be
// looked up or if the BMC rejects the reset request.
func (r *RedfishBMC) ForcePowerOff(systemUUID string) error {
	system, err := r.getSystemByUUID(systemUUID)
	if err != nil {
		return fmt.Errorf("failed to get systems: %w", err)
	}
	// ForceOffResetType asks the BMC for a non-graceful power off.
	if err := system.Reset(redfish.ForceOffResetType); err != nil {
		return fmt.Errorf("failed to force power off system: %w", err)
	}
	return nil
}

// Reset performs a reset on the system using Redfish.
func (r *RedfishBMC) Reset(systemUUID string, resetType redfish.ResetType) error {
system, err := r.getSystemByUUID(systemUUID)
Expand Down
4 changes: 4 additions & 0 deletions cmd/manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ func main() {
var registryResyncInterval time.Duration
var webhookPort int
var enforceFirstBoot bool
var enforcePowerOff bool
var serverResyncInterval time.Duration
var powerPollingInterval time.Duration
var powerPollingTimeout time.Duration
Expand All @@ -82,6 +83,8 @@ func main() {
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enforceFirstBoot, "enforce-first-boot", false,
"Enforce the first boot probing of a Server even if it is powered on in the Initial state.")
flag.BoolVar(&enforcePowerOff, "enforce-power-off", false,
"Enforce the power off of a Server when graceful shutdown fails.")
flag.IntVar(&webhookPort, "webhook-port", 9445, "The port to use for webhook server.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
"Enable leader election for controller manager. "+
Expand Down Expand Up @@ -212,6 +215,7 @@ func main() {
RegistryResyncInterval: registryResyncInterval,
ResyncInterval: serverResyncInterval,
EnforceFirstBoot: enforceFirstBoot,
EnforcePowerOff: enforcePowerOff,
PowerPollingInterval: powerPollingInterval,
PowerPollingTimeout: powerPollingTimeout,
}).SetupWithManager(mgr); err != nil {
Expand Down
18 changes: 16 additions & 2 deletions internal/controller/server_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ type ServerReconciler struct {
ProbeOSImage string
RegistryResyncInterval time.Duration
EnforceFirstBoot bool
EnforcePowerOff bool
ResyncInterval time.Duration
PowerPollingInterval time.Duration
PowerPollingTimeout time.Duration
Expand Down Expand Up @@ -671,11 +672,24 @@ func (r *ServerReconciler) ensureServerPowerState(ctx context.Context, log logr.
return fmt.Errorf("failed to wait for server power on server: %w", err)
}
case powerOpOff:
if err := bmcClient.PowerOff(server.Spec.UUID); err != nil {
powerOffType := bmcClient.PowerOff

if err := powerOffType(server.Spec.UUID); err != nil {
return fmt.Errorf("failed to power off server: %w", err)
}
if err := r.waitForServerPowerState(ctx, log, bmcClient, server, redfish.OffPowerState); err != nil {
return fmt.Errorf("failed to wait for server power off server: %w", err)
if r.EnforcePowerOff {
log.V(1).Info("Failed to wait for server graceful shutdown, retrying with force power off")
powerOffType = bmcClient.ForcePowerOff
if err := powerOffType(server.Spec.UUID); err != nil {
return fmt.Errorf("failed to power off server: %w", err)
}
if err := r.waitForServerPowerState(ctx, log, bmcClient, server, redfish.OffPowerState); err != nil {
return fmt.Errorf("failed to wait for server force power off: %w", err)
}
} else {
return fmt.Errorf("failed to wait for server power off: %w", err)
}
}
}
log.V(1).Info("Ensured server power state", "PowerState", server.Spec.Power)
Expand Down
Loading