From 358bac0843357035e31091fbe1d7402caad66c9f Mon Sep 17 00:00:00 2001 From: Valay Dave Date: Fri, 28 Feb 2025 12:15:58 -0800 Subject: [PATCH] [jobsets] Fix bug in jobset atexit on local scheduler - Local scheduler was deleting successfully completed jobsets. We now avoid this by ensuring that jobsets are killed or deleted only when they are running or waiting (in suspended state). --- metaflow/plugins/kubernetes/kubernetes_jobsets.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/metaflow/plugins/kubernetes/kubernetes_jobsets.py b/metaflow/plugins/kubernetes/kubernetes_jobsets.py index 49c75ff4163..4ffee838b07 100644 --- a/metaflow/plugins/kubernetes/kubernetes_jobsets.py +++ b/metaflow/plugins/kubernetes/kubernetes_jobsets.py @@ -319,6 +319,8 @@ def _fetch_pod(self): def kill(self): plural = "jobsets" client = self._client.get() + if not (self.is_running or self.is_waiting): + return try: # Killing the control pod will trigger the jobset to mark everything as failed. # Since jobsets have a successPolicy set to `All` which ensures that everything has