From 27d9ca67b2b57ca44b8da1ec32d725305c92c29d Mon Sep 17 00:00:00 2001 From: Sanjay Srikakulam Date: Tue, 11 Feb 2025 16:16:30 +0100 Subject: [PATCH 1/5] Add config options and celery periodic task for failed jobs jwd cleanup task --- doc/source/admin/galaxy_options.rst | 34 +++++++++++++++++++++ lib/galaxy/celery/__init__.py | 3 ++ lib/galaxy/config/sample/galaxy.yml.sample | 25 +++++++++++---- lib/galaxy/config/schemas/config_schema.yml | 21 +++++++++++++ 4 files changed, 77 insertions(+), 6 deletions(-) diff --git a/doc/source/admin/galaxy_options.rst b/doc/source/admin/galaxy_options.rst index 9caac0e4f7fc..96d13a8de754 100644 --- a/doc/source/admin/galaxy_options.rst +++ b/doc/source/admin/galaxy_options.rst @@ -5747,4 +5747,38 @@ :Type: int +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +``enable_failed_jobs_working_directory_cleanup`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:Description: + Enables the cleanup of failed Galaxy job's working directories. + Runs in a Celery task. +:Default: ``false`` +:Type: bool + + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +``failed_jobs_working_directory_cleanup_days`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:Description: + The number of days to keep failed Galaxy job's working directories + before attempting to delete them. Runs in a Celery task. +:Default: ``5`` +:Type: int + + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +``failed_jobs_working_directory_cleanup_interval`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:Description: + The interval in seconds between attempts to delete all failed + Galaxy job's working directories from the filesystem (every 24 + hours by default). Runs in a Celery task. 
+:Default: ``86400`` +:Type: int + + diff --git a/lib/galaxy/celery/__init__.py b/lib/galaxy/celery/__init__.py index 120df3c28906..7b313d55f3e9 100644 --- a/lib/galaxy/celery/__init__.py +++ b/lib/galaxy/celery/__init__.py @@ -246,6 +246,9 @@ def schedule_task(task, interval): if config.object_store_cache_monitor_driver in ["auto", "celery"]: schedule_task("clean_object_store_caches", config.object_store_cache_monitor_interval) + if config.enable_failed_jobs_working_directory_cleanup: + schedule_task("cleanup_jwds", config.failed_jobs_working_directory_cleanup_interval) + if beat_schedule: celery_app.conf.beat_schedule = beat_schedule diff --git a/lib/galaxy/config/sample/galaxy.yml.sample b/lib/galaxy/config/sample/galaxy.yml.sample index 6035d7266b35..17db9880e0d9 100644 --- a/lib/galaxy/config/sample/galaxy.yml.sample +++ b/lib/galaxy/config/sample/galaxy.yml.sample @@ -1,21 +1,21 @@ # Galaxy is configured by default to be usable in a single-user development # environment. To tune the application for a multi-user production # environment, see the documentation at: -# +# # https://docs.galaxyproject.org/en/master/admin/production.html -# +# # Throughout this sample configuration file, except where stated otherwise, # uncommented values override the default if left unset, whereas commented # values are set to the default value. Relative paths are relative to the root # Galaxy directory. -# +# # Examples of many of these options are explained in more detail in the Galaxy # Community Hub. -# +# # https://galaxyproject.org/admin/config -# +# # Config hackers are encouraged to check there before asking for help. -# +# # Configuration for Gravity process manager. # ``uwsgi:`` section will be ignored if Galaxy is started via Gravity commands (e.g ``./run.sh``, ``galaxy`` or ``galaxyctl``). gravity: @@ -3067,3 +3067,16 @@ galaxy: # affects s3fs file sources. #file_source_listings_expiry_time: 60 + # Enables the cleanup of failed Galaxy job's working directories. 
Runs + # in a Celery task. + #enable_failed_jobs_working_directory_cleanup: false + + # The number of days to keep failed Galaxy job's working directories + # before attempting to delete them. Runs in a Celery task. + #failed_jobs_working_directory_cleanup_days: 5 + + # The interval in seconds between attempts to delete all failed Galaxy + # job's working directories from the filesystem (every 24 hours by + # default). Runs in a Celery task. + #failed_jobs_working_directory_cleanup_interval: 86400 + diff --git a/lib/galaxy/config/schemas/config_schema.yml b/lib/galaxy/config/schemas/config_schema.yml index d43c647fa571..76f23327de26 100644 --- a/lib/galaxy/config/schemas/config_schema.yml +++ b/lib/galaxy/config/schemas/config_schema.yml @@ -4240,3 +4240,24 @@ mapping: Number of seconds before file source content listings are refreshed. Shorter times will result in more queries while browsing a file sources. Longer times will result in fewer requests to file sources but outdated contents might be displayed to the user. Currently only affects s3fs file sources. + + enable_failed_jobs_working_directory_cleanup: + type: bool + default: false + required: false + desc: | + Enables the cleanup of failed Galaxy job's working directories. Runs in a Celery task. + + failed_jobs_working_directory_cleanup_days: + type: int + required: false + default: 5 + desc: | + The number of days to keep failed Galaxy job's working directories before attempting to delete them. Runs in a Celery task. + + failed_jobs_working_directory_cleanup_interval: + type: int + required: false + default: 86400 + desc: | + The interval in seconds between attempts to delete all failed Galaxy job's working directories from the filesystem (every 24 hours by default). Runs in a Celery task. 
From 7ea32e0cdaaffcfa3740b41189f2b65027ea3102 Mon Sep 17 00:00:00 2001 From: Sanjay Srikakulam Date: Wed, 12 Feb 2025 09:54:07 +0100 Subject: [PATCH 2/5] update the cleanup_jwds function to accept the entire config and then use the days config option defined there. --- lib/galaxy/celery/tasks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/galaxy/celery/tasks.py b/lib/galaxy/celery/tasks.py index 9265e2815e38..375d855848c5 100644 --- a/lib/galaxy/celery/tasks.py +++ b/lib/galaxy/celery/tasks.py @@ -508,7 +508,7 @@ def dispatch_pending_notifications(notification_manager: NotificationManager): @galaxy_task(action="clean up job working directories") -def cleanup_jwds(sa_session: galaxy_scoped_session, object_store: BaseObjectStore, days: int = 5): +def cleanup_jwds(sa_session: galaxy_scoped_session, object_store: BaseObjectStore, config: GalaxyAppConfiguration): """Cleanup job working directories for failed jobs that are older than X days""" def get_failed_jobs(): @@ -530,6 +530,7 @@ def delete_jwd(job): log.error(f"Error deleting job working directory: {path} : {e.strerror}") + days = config.failed_jobs_working_directory_cleanup_days failed_jobs = get_failed_jobs() if not failed_jobs: log.info("No failed jobs found within the last %s days", days) From 511cacb1f505dd52a5cb7cfa0b4d428196fda2e0 Mon Sep 17 00:00:00 2001 From: Sanjay Kumar Srikakulam Date: Wed, 12 Feb 2025 16:13:36 +0100 Subject: [PATCH 3/5] Update lib/galaxy/config/schemas/config_schema.yml Co-authored-by: John Chilton --- lib/galaxy/config/schemas/config_schema.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/config/schemas/config_schema.yml b/lib/galaxy/config/schemas/config_schema.yml index 76f23327de26..5f172ead05ae 100644 --- a/lib/galaxy/config/schemas/config_schema.yml +++ b/lib/galaxy/config/schemas/config_schema.yml @@ -4253,7 +4253,7 @@ mapping: required: false default: 5 desc: | - The number of days to keep failed Galaxy job's working 
directories before attempting to delete them. Runs in a Celery task. + The number of days to keep failed Galaxy job's working directories before attempting to delete them if enable_failed_jobs_working_directory_cleanup is ``true``. Runs in a Celery task. failed_jobs_working_directory_cleanup_interval: type: int From 09123f5015c828e5c86b055a9b4a24fd762bb012 Mon Sep 17 00:00:00 2001 From: Sanjay Kumar Srikakulam Date: Wed, 12 Feb 2025 16:13:46 +0100 Subject: [PATCH 4/5] Update lib/galaxy/config/schemas/config_schema.yml Co-authored-by: John Chilton --- lib/galaxy/config/schemas/config_schema.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/galaxy/config/schemas/config_schema.yml b/lib/galaxy/config/schemas/config_schema.yml index 5f172ead05ae..4b0c11ec2bd1 100644 --- a/lib/galaxy/config/schemas/config_schema.yml +++ b/lib/galaxy/config/schemas/config_schema.yml @@ -4260,4 +4260,4 @@ mapping: required: false default: 86400 desc: | - The interval in seconds between attempts to delete all failed Galaxy job's working directories from the filesystem (every 24 hours by default). Runs in a Celery task. + The interval in seconds between attempts to delete all failed Galaxy job's working directories from the filesystem (every 24 hours by default) if enable_failed_jobs_working_directory_cleanup is ``true``. Runs in a Celery task. 
From e1e962ffe89b5953a6e68adf8c6944918665d073 Mon Sep 17 00:00:00 2001 From: Sanjay Srikakulam Date: Wed, 12 Feb 2025 16:20:47 +0100 Subject: [PATCH 5/5] add rebuilt config files --- doc/source/admin/galaxy_options.rst | 7 +++++-- lib/galaxy/config/sample/galaxy.yml.sample | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/doc/source/admin/galaxy_options.rst b/doc/source/admin/galaxy_options.rst index 96d13a8de754..9e27373fbc73 100644 --- a/doc/source/admin/galaxy_options.rst +++ b/doc/source/admin/galaxy_options.rst @@ -5764,7 +5764,9 @@ :Description: The number of days to keep failed Galaxy job's working directories - before attempting to delete them. Runs in a Celery task. + before attempting to delete them if + enable_failed_jobs_working_directory_cleanup is ``true``. Runs in + a Celery task. :Default: ``5`` :Type: int @@ -5776,7 +5778,8 @@ :Description: The interval in seconds between attempts to delete all failed Galaxy job's working directories from the filesystem (every 24 - hours by default). Runs in a Celery task. + hours by default) if enable_failed_jobs_working_directory_cleanup + is ``true``. Runs in a Celery task. :Default: ``86400`` :Type: int diff --git a/lib/galaxy/config/sample/galaxy.yml.sample b/lib/galaxy/config/sample/galaxy.yml.sample index 17db9880e0d9..f0a082ae5950 100644 --- a/lib/galaxy/config/sample/galaxy.yml.sample +++ b/lib/galaxy/config/sample/galaxy.yml.sample @@ -3072,11 +3072,14 @@ galaxy: #enable_failed_jobs_working_directory_cleanup: false # The number of days to keep failed Galaxy job's working directories - # before attempting to delete them. Runs in a Celery task. + # before attempting to delete them if + # enable_failed_jobs_working_directory_cleanup is ``true``. Runs in a + # Celery task. #failed_jobs_working_directory_cleanup_days: 5 # The interval in seconds between attempts to delete all failed Galaxy # job's working directories from the filesystem (every 24 hours by - # default). 
Runs in a Celery task. + # default) if enable_failed_jobs_working_directory_cleanup is + # ``true``. Runs in a Celery task. #failed_jobs_working_directory_cleanup_interval: 86400