diff --git a/Dockerfile b/Dockerfile index cba8e6d..89ec6c7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -34,6 +34,7 @@ RUN --mount=type=cache,target=/root/.cache/go-build \ # Use distroless as minimal base image to package the manager binary # Refer to https://github.com/GoogleContainerTools/distroless for more details FROM gcr.io/distroless/static:nonroot AS manager +LABEL source_repository="https://github.com/ironcore-dev/metal-operator" WORKDIR / COPY --from=builder /workspace/manager . USER 65532:65532 @@ -41,6 +42,7 @@ USER 65532:65532 ENTRYPOINT ["/manager"] FROM gcr.io/distroless/static:nonroot AS probe +LABEL source_repository="https://github.com/ironcore-dev/metal-operator" WORKDIR / COPY --from=builder /workspace/metalprobe . USER 65532:65532 diff --git a/bmc/bmc.go b/bmc/bmc.go index fa6c8e5..300092e 100644 --- a/bmc/bmc.go +++ b/bmc/bmc.go @@ -13,9 +13,12 @@ type BMC interface { // PowerOn powers on the system. PowerOn(systemUUID string) error - // PowerOff powers off the system. + // PowerOff gracefully shuts down the system. PowerOff(systemUUID string) error + // ForcePowerOff forcibly powers off the system. + ForcePowerOff(systemUUID string) error + // Reset performs a reset on the system. Reset(systemUUID string, resetType redfish.ResetType) error diff --git a/bmc/redfish.go b/bmc/redfish.go index cf10dca..e458e35 100644 --- a/bmc/redfish.go +++ b/bmc/redfish.go @@ -74,7 +74,7 @@ func (r *RedfishBMC) PowerOn(systemUUID string) error { return nil } -// PowerOff powers off the system using Redfish. +// PowerOff gracefully shuts down the system using Redfish. func (r *RedfishBMC) PowerOff(systemUUID string) error { system, err := r.getSystemByUUID(systemUUID) if err != nil { @@ -86,6 +86,18 @@ func (r *RedfishBMC) PowerOff(systemUUID string) error { return nil } +// ForcePowerOff forcibly powers off the system using Redfish. 
+func (r *RedfishBMC) ForcePowerOff(systemUUID string) error { + system, err := r.getSystemByUUID(systemUUID) + if err != nil { + return fmt.Errorf("failed to get systems: %w", err) + } + if err := system.Reset(redfish.ForceOffResetType); err != nil { + return fmt.Errorf("failed to force power off system: %w", err) + } + return nil +} + // Reset performs a reset on the system using Redfish. func (r *RedfishBMC) Reset(systemUUID string, resetType redfish.ResetType) error { system, err := r.getSystemByUUID(systemUUID) diff --git a/cmd/manager/main.go b/cmd/manager/main.go index 78188e0..83d3182 100644 --- a/cmd/manager/main.go +++ b/cmd/manager/main.go @@ -59,6 +59,7 @@ func main() { var registryResyncInterval time.Duration var webhookPort int var enforceFirstBoot bool + var enforcePowerOff bool var serverResyncInterval time.Duration var powerPollingInterval time.Duration var powerPollingTimeout time.Duration @@ -82,6 +83,8 @@ func main() { flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") flag.BoolVar(&enforceFirstBoot, "enforce-first-boot", false, "Enforce the first boot probing of a Server even if it is powered on in the Initial state.") + flag.BoolVar(&enforcePowerOff, "enforce-power-off", false, + "Enforce the power off of a Server when graceful shutdown fails.") flag.IntVar(&webhookPort, "webhook-port", 9445, "The port to use for webhook server.") flag.BoolVar(&enableLeaderElection, "leader-elect", false, "Enable leader election for controller manager. 
"+ @@ -212,6 +215,7 @@ func main() { RegistryResyncInterval: registryResyncInterval, ResyncInterval: serverResyncInterval, EnforceFirstBoot: enforceFirstBoot, + EnforcePowerOff: enforcePowerOff, PowerPollingInterval: powerPollingInterval, PowerPollingTimeout: powerPollingTimeout, }).SetupWithManager(mgr); err != nil { diff --git a/internal/controller/server_controller.go b/internal/controller/server_controller.go index a9de71f..57b64db 100644 --- a/internal/controller/server_controller.go +++ b/internal/controller/server_controller.go @@ -61,6 +61,7 @@ type ServerReconciler struct { ProbeOSImage string RegistryResyncInterval time.Duration EnforceFirstBoot bool + EnforcePowerOff bool ResyncInterval time.Duration PowerPollingInterval time.Duration PowerPollingTimeout time.Duration @@ -671,11 +672,24 @@ func (r *ServerReconciler) ensureServerPowerState(ctx context.Context, log logr. return fmt.Errorf("failed to wait for server power on server: %w", err) } case powerOpOff: - if err := bmcClient.PowerOff(server.Spec.UUID); err != nil { + powerOffType := bmcClient.PowerOff + + if err := powerOffType(server.Spec.UUID); err != nil { return fmt.Errorf("failed to power off server: %w", err) } if err := r.waitForServerPowerState(ctx, log, bmcClient, server, redfish.OffPowerState); err != nil { - return fmt.Errorf("failed to wait for server power off server: %w", err) + if r.EnforcePowerOff { + log.V(1).Info("Failed to wait for server graceful shutdown, retrying with force power off") + powerOffType = bmcClient.ForcePowerOff + if err := powerOffType(server.Spec.UUID); err != nil { + return fmt.Errorf("failed to power off server: %w", err) + } + if err := r.waitForServerPowerState(ctx, log, bmcClient, server, redfish.OffPowerState); err != nil { + return fmt.Errorf("failed to wait for server force power off: %w", err) + } + } else { + return fmt.Errorf("failed to wait for server power off: %w", err) + } } } log.V(1).Info("Ensured server power state", "PowerState", 
server.Spec.Power)