From b00e5f4aafeb23563c77e50d3dc9abee9fa6310a Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Sun, 19 Jan 2025 17:28:10 +0000 Subject: [PATCH 1/5] dependency-after: allow alternate msg when raising exceptions Problem: The raise_exceptions() helper function uses a static message in the exception note, but callers may want to provide something more specific. Add a msg parameter to raise_exceptions(). --- src/modules/job-manager/plugins/dependency-after.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/modules/job-manager/plugins/dependency-after.c b/src/modules/job-manager/plugins/dependency-after.c index ce47d71c2f10..51c39bf6d049 100644 --- a/src/modules/job-manager/plugins/dependency-after.c +++ b/src/modules/job-manager/plugins/dependency-after.c @@ -475,18 +475,20 @@ static void release_all (flux_plugin_t *p, zlistx_t *l, int typemask) /* * Raise exceptions for all unhandled depednencies in list `l`. */ -static void raise_exceptions (flux_plugin_t *p, zlistx_t *l) +static void raise_exceptions (flux_plugin_t *p, zlistx_t *l, const char *msg) { if (l) { struct after_info *after; + if (!msg) + msg = "can never be satisfied"; FOREACH_ZLISTX (l, after) { if (flux_jobtap_raise_exception (p, after->depid, "dependency", 0, - "%s %s can never be satisfied", - "dependency", - after->description) < 0) + "dependency %s %s", + after->description, + msg) < 0) flux_log_error (flux_jobtap_get_flux (p), "id=%s: unable to raise exception for %s", idf58 (after->depid), @@ -558,7 +560,7 @@ static int release_dependent_jobs (flux_plugin_t *p, zlistx_t *l) /* Any remaining dependencies can't now be satisfied. * Raise exceptions on any remaining members of list `l` */ - raise_exceptions (p, l); + raise_exceptions (p, l, "can never be satisfied"); return 0; } From 8845d3c53febd298e63784ffa2ea945dd18708ed Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Sun, 19 Jan 2025 17:30:18 +0000 Subject: [PATCH 2/5] dependency-after: ensure only jobs that ran satisfy dependencies Problem: The dependency-after plugin allows jobs that were canceled before the RUN state to satisfy an `afternotok` dependency. This causes unexpected behavior when `afternotok` chaining is used to keep a job running until success: Once the job is successful, this raises an exception on the dependent job, which fails and releases the depedency on the next job in the chain, which unexpectedly runs. The principle of least surprise probably dictates that `after*` dependencies can only ever be satified for jobs that actually ran, and therefore jobs canceled before an `alloc` event should raise an exception on *any* dependent job. Update the dependency-after plugin so that depencies are never satified for jobs that did not enter the RUN state. Fixes #6555 --- .../job-manager/plugins/dependency-after.c | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/src/modules/job-manager/plugins/dependency-after.c b/src/modules/job-manager/plugins/dependency-after.c index 51c39bf6d049..c2c60088c47a 100644 --- a/src/modules/job-manager/plugins/dependency-after.c +++ b/src/modules/job-manager/plugins/dependency-after.c @@ -252,6 +252,15 @@ static int dependency_handle_inactive (flux_plugin_t *p, "dependency: failed to get %ss result", jobid); + /* If the target job did not enter RUN state, immediately raise an + * exception since after* dependencies only apply to jobs that actually + * ran: + */ + if (!flux_jobtap_job_event_posted (p, afterid, "alloc")) + return flux_jobtap_reject_job (p, + args, + "dependency: after: %s never started", + jobid); if (type == AFTER_START && !flux_jobtap_job_event_posted (p, afterid, "start")) return flux_jobtap_reject_job (p, @@ -549,8 +558,17 @@ static int release_dependent_jobs (flux_plugin_t *p, zlistx_t *l) return -1; } - /* Release dependent jobs based on requisite job result. - * Entries will be removed from the list as they are processed. + /* If this job never entered RUN state (i.e. got an exception before + * the alloc event), then none of the after* dependencies can be + * satisfied. Immediately raise exception(s). + */ + if (!flux_jobtap_job_event_posted (p, FLUX_JOBTAP_CURRENT_JOB, "alloc")) { + raise_exceptions (p, l, "job never started"); + return 0; + } + + /* O/w, release dependent jobs based on requisite job result. + * Entries will be removed from the list as they are processed. */ if (result != FLUX_JOB_RESULT_COMPLETED) release_all (p, l, AFTER_FINISH | AFTER_FAILURE); From a2a5ee6f03fdfcb9ec84fc2be2ac3cf151b46726 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Sun, 19 Jan 2025 19:25:31 +0000 Subject: [PATCH 3/5] testsuite: fix whitespace in t2271-job-dependency-after.t Problem: There's some trailing whitespace in t2271-job-dependency-after.t. Fix it. --- t/t2271-job-dependency-after.t | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/t/t2271-job-dependency-after.t b/t/t2271-job-dependency-after.t index 3cbd383e5d7e..c917cbc86632 100755 --- a/t/t2271-job-dependency-after.t +++ b/t/t2271-job-dependency-after.t @@ -63,7 +63,7 @@ test_expect_success 'dependency=after works' ' test_debug "echo checking that job ${depid} is in DEPEND state" && test "$(flux jobs -no {state} $depid)" = "DEPEND" && flux job urgency $jobid default && - flux job wait-event -vt 15 $depid clean + flux job wait-event -vt 15 $depid clean ' test_expect_success 'dependency=after does not release job until start event' ' jobid=$(flux submit \ @@ -96,7 +96,7 @@ test_expect_success 'dependency=after generates exception for failed job' ' test_debug "echo checking that job ${depid} is in DEPEND state" && test "$(flux jobs -no {state} $depid)" = "DEPEND" && flux cancel $jobid && - flux job wait-event -m type=dependency -vt 15 $depid exception + flux job wait-event -m type=dependency -vt 15 $depid exception ' test_expect_success 'dependency=afterany works' ' flux bulksubmit \ @@ -205,7 +205,7 @@ test_expect_success 'dependency=afterok works for INACTIVE job' ' flux run --dependency=afterok:${job1} \ echo afterok:${job1} && test_must_fail flux run --dependency=afterok:${job2} hostname && - test_must_fail flux run --dependency=afterok:${job3} hostname + test_must_fail flux run --dependency=afterok:${job3} hostname ' test_expect_success 'dependency=afternotok works for INACTIVE job' ' run_timeout 15 \ From 8ce2d8acac7ea1f3e04e9517cf2d6b2c3629f9a8 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Sun, 19 Jan 2025 17:50:37 +0000 Subject: [PATCH 4/5] testsuite: ensure dependencies satisfied only for jobs that ran Problem: No tests in the testsuite ensure that dependencies are only satisfied by the dependency-after plugin when jobs actually ran. Add a couple tests to t2271-job-dependency-after.t. --- t/t2271-job-dependency-after.t | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/t/t2271-job-dependency-after.t b/t/t2271-job-dependency-after.t index c917cbc86632..02481bb09e0c 100755 --- a/t/t2271-job-dependency-after.t +++ b/t/t2271-job-dependency-after.t @@ -186,6 +186,32 @@ test_expect_success 'dependency=afternotok works' ' flux job wait-event -vt 15 \ -m type=dependency $ok1 exception ' +test_expect_success 'dependency=afternotok only applies to jobs that start' ' + canceled_job=$(flux submit --urgency=hold false) && + id2=$(flux submit --dependency=afternotok:$canceled_job hostname) && + flux cancel $canceled_job && + flux job wait-event -vt 15 -m type=dependency $id2 exception +' +test_expect_success 'chain of afternotok jobs are canceled if one fails before start' ' + id=$(flux submit --urgency=hold false) && + id2=$(flux submit --dependency=afternotok:$id hostname) && + id3=$(flux submit --dependency=afternotok:$id2 hostname) && + id4=$(flux submit --dependency=afternotok:$id3 hostname) && + flux cancel $id && + for i in $id2 $id3 $id4; do + flux job wait-event -vt 15 -m type=dependency $id2 exception + done +' +test_expect_success 'chain of afternotok jobs are canceled if one succeeds' ' + id=$(flux submit --urgency=hold true) && + id2=$(flux submit --dependency=afternotok:$id hostname) && + id3=$(flux submit --dependency=afternotok:$id2 hostname) && + id4=$(flux submit --dependency=afternotok:$id3 hostname) && + flux job urgency $id default && + for i in $id2 $id3 $id4; do + flux job wait-event -vt 15 -m type=dependency $id2 exception + done +' test_expect_success 'dependency=after works for INACTIVE jobs' ' run_timeout 15 \ flux bulksubmit --wait --watch \ @@ -215,6 +241,9 @@ test_expect_success 'dependency=afternotok works for INACTIVE job' ' echo afterany:{} ::: ${job2} ${job3} && test_must_fail flux run --dependency=afternotok:${job1} hostname ' +test_expect_success 'afternotok fails for INACTIVE job with no start event' ' + test_must_fail flux run --dependency=afternotok:${canceled_job} hostname +' test_expect_success 'dependency=after fails for INACTIVE canceled job' ' job4=$(flux submit --urgency=hold hostname) && flux cancel ${job4} && From 15fddabf1b49616f3c0ad4a25cfb37b5c088ddb3 Mon Sep 17 00:00:00 2001 From: "Mark A. Grondona" Date: Sun, 19 Jan 2025 17:58:28 +0000 Subject: [PATCH 5/5] doc: note caveat about RUN state in job-dependencies.rst Problem: The documentation of the `--dependency=` option for job submission commands does not make it clear that the `after*` dependencies only apply to jobs that entered the RUN state. Add a note that makes this explicit. --- doc/man1/common/job-dependencies.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/man1/common/job-dependencies.rst b/doc/man1/common/job-dependencies.rst index 09007c68e3e9..2548e7fdb6b0 100644 --- a/doc/man1/common/job-dependencies.rst +++ b/doc/man1/common/job-dependencies.rst @@ -28,6 +28,13 @@ The following dependency schemes are built-in: has been submitted, then the submission will be rejected with an error that the target job cannot be found. +.. note:: + The ``after*`` dependency schemes below only satisfy dependencies for + jobs that entered the RUN state. A job that is canceled while pending + does not satisfy the `afterany` or `afternotok` dependencies. Thus, + canceling a job with a chain of dependencies causes all jobs in the + chain to be canceled. + after:JOBID This dependency is satisfied after JOBID starts.