From 1114bd74c199b865f670111493438ac3451f93e2 Mon Sep 17 00:00:00 2001 From: Lucas Koehler Date: Fri, 24 Jan 2025 09:55:00 +0100 Subject: [PATCH] Stabilize service cleanup for eager start Removing the labels and owner reference fails for rare cases when cleaning an eager instance's service after a session was removed. This adds retries to stabilize the sessionDeleted handling. --- .../handler/session/EagerSessionHandler.java | 43 +++++++++++++------ 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/java/operator/org.eclipse.theia.cloud.operator/src/main/java/org/eclipse/theia/cloud/operator/handler/session/EagerSessionHandler.java b/java/operator/org.eclipse.theia.cloud.operator/src/main/java/org/eclipse/theia/cloud/operator/handler/session/EagerSessionHandler.java index a627f080..6dcb7cbd 100644 --- a/java/operator/org.eclipse.theia.cloud.operator/src/main/java/org/eclipse/theia/cloud/operator/handler/session/EagerSessionHandler.java +++ b/java/operator/org.eclipse.theia.cloud.operator/src/main/java/org/eclipse/theia/cloud/operator/handler/session/EagerSessionHandler.java @@ -351,19 +351,36 @@ public boolean sessionDeleted(Session session, String correlationId) { String serviceName = ownedService.getMetadata().getName(); // Remove owner reference and user specific labels from the service - Service cleanedService; - try { - cleanedService = client.services().withName(serviceName).edit(service -> { - TheiaCloudHandlerUtil.removeOwnerReferenceFromItem(correlationId, sessionResourceName, - sessionResourceUID, service); - service.getMetadata().getLabels().keySet().removeAll(LabelsUtil.getSessionSpecificLabelKeys()); - return service; - }); - LOGGER.info(formatLogMessage(correlationId, - "Removed owner reference and user-specific session labels from service: " + serviceName)); - } catch (KubernetesClientException e) { - LOGGER.error(formatLogMessage(correlationId, "Error while editing service " + serviceName), e); - return false; + // Allow retries because in rare cases the update fails. It is not clear why but might be caused by the owner + // reference being removed by Kubernetes garbage collection. + // The retries aim to stabilize the clean up process. + Service cleanedService = null; + int editServiceAttempts = 0; + boolean editServiceSuccess = false; + while (editServiceAttempts < 3 && !editServiceSuccess) { + try { + cleanedService = client.services().withName(serviceName).edit(service -> { + TheiaCloudHandlerUtil.removeOwnerReferenceFromItem(correlationId, sessionResourceName, + sessionResourceUID, service); + service.getMetadata().getLabels().keySet().removeAll(LabelsUtil.getSessionSpecificLabelKeys()); + return service; + }); + LOGGER.info(formatLogMessage(correlationId, + "Removed owner reference and user-specific session labels from service: " + serviceName)); + editServiceSuccess = true; + } catch (KubernetesClientException e) { + editServiceAttempts++; + if (editServiceAttempts < 3) { + LOGGER.warn( + formatLogMessage(correlationId, + "Attempt " + editServiceAttempts + " failed while editing service " + serviceName), + e); + } else { + LOGGER.error(formatLogMessage(correlationId, "Error while editing service " + serviceName + + " after " + editServiceAttempts + " attempts"), e); + return false; + } + } } Integer instance = TheiaCloudServiceUtil.getId(correlationId, appDefinition.get(), cleanedService);