Skip to content

Commit

Permalink
Merge pull request #6565 from grondo/issue#6555
Browse files Browse the repository at this point in the history
ensure only jobs that entered the RUN state can satisfy `--dependency=afterany|afternotok`
  • Loading branch information
mergify[bot] authored Jan 19, 2025
2 parents eeaec43 + 15fddab commit 8a18be7
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 10 deletions.
7 changes: 7 additions & 0 deletions doc/man1/common/job-dependencies.rst
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,13 @@ The following dependency schemes are built-in:
has been submitted, then the submission will be rejected with an error
that the target job cannot be found.

.. note::
The ``after*`` dependency schemes below only satisfy dependencies for
jobs that entered the RUN state. A job that is canceled while pending
does not satisfy the `afterany` or `afternotok` dependencies. Thus,
canceling a job with a chain of dependencies causes all jobs in the
chain to be canceled.

after:JOBID
This dependency is satisfied after JOBID starts.

Expand Down
34 changes: 27 additions & 7 deletions src/modules/job-manager/plugins/dependency-after.c
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,15 @@ static int dependency_handle_inactive (flux_plugin_t *p,
"dependency: failed to get %ss result",
jobid);

/* If the target job did not enter RUN state, immediately raise an
* exception since after* dependencies only apply to jobs that actually
* ran:
*/
if (!flux_jobtap_job_event_posted (p, afterid, "alloc"))
return flux_jobtap_reject_job (p,
args,
"dependency: after: %s never started",
jobid);
if (type == AFTER_START
&& !flux_jobtap_job_event_posted (p, afterid, "start"))
return flux_jobtap_reject_job (p,
Expand Down Expand Up @@ -475,18 +484,20 @@ static void release_all (flux_plugin_t *p, zlistx_t *l, int typemask)
/*
* Raise exceptions for all unhandled depednencies in list `l`.
*/
static void raise_exceptions (flux_plugin_t *p, zlistx_t *l)
static void raise_exceptions (flux_plugin_t *p, zlistx_t *l, const char *msg)
{
if (l) {
struct after_info *after;
if (!msg)
msg = "can never be satisfied";
FOREACH_ZLISTX (l, after) {
if (flux_jobtap_raise_exception (p,
after->depid,
"dependency",
0,
"%s %s can never be satisfied",
"dependency",
after->description) < 0)
"dependency %s %s",
after->description,
msg) < 0)
flux_log_error (flux_jobtap_get_flux (p),
"id=%s: unable to raise exception for %s",
idf58 (after->depid),
Expand Down Expand Up @@ -547,8 +558,17 @@ static int release_dependent_jobs (flux_plugin_t *p, zlistx_t *l)
return -1;
}

/* Release dependent jobs based on requisite job result.
* Entries will be removed from the list as they are processed.
/* If this job never entered RUN state (i.e. got an exception before
* the alloc event), then none of the after* dependencies can be
* satisfied. Immediately raise exception(s).
*/
if (!flux_jobtap_job_event_posted (p, FLUX_JOBTAP_CURRENT_JOB, "alloc")) {
raise_exceptions (p, l, "job never started");
return 0;
}

/* O/w, release dependent jobs based on requisite job result.
* Entries will be removed from the list as they are processed.
*/
if (result != FLUX_JOB_RESULT_COMPLETED)
release_all (p, l, AFTER_FINISH | AFTER_FAILURE);
Expand All @@ -558,7 +578,7 @@ static int release_dependent_jobs (flux_plugin_t *p, zlistx_t *l)
/* Any remaining dependencies can't now be satisfied.
* Raise exceptions on any remaining members of list `l`
*/
raise_exceptions (p, l);
raise_exceptions (p, l, "can never be satisfied");

return 0;
}
Expand Down
35 changes: 32 additions & 3 deletions t/t2271-job-dependency-after.t
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ test_expect_success 'dependency=after works' '
test_debug "echo checking that job ${depid} is in DEPEND state" &&
test "$(flux jobs -no {state} $depid)" = "DEPEND" &&
flux job urgency $jobid default &&
flux job wait-event -vt 15 $depid clean
flux job wait-event -vt 15 $depid clean
'
test_expect_success 'dependency=after does not release job until start event' '
jobid=$(flux submit \
Expand Down Expand Up @@ -96,7 +96,7 @@ test_expect_success 'dependency=after generates exception for failed job' '
test_debug "echo checking that job ${depid} is in DEPEND state" &&
test "$(flux jobs -no {state} $depid)" = "DEPEND" &&
flux cancel $jobid &&
flux job wait-event -m type=dependency -vt 15 $depid exception
flux job wait-event -m type=dependency -vt 15 $depid exception
'
test_expect_success 'dependency=afterany works' '
flux bulksubmit \
Expand Down Expand Up @@ -186,6 +186,32 @@ test_expect_success 'dependency=afternotok works' '
flux job wait-event -vt 15 \
-m type=dependency $ok1 exception
'
test_expect_success 'dependency=afternotok only applies to jobs that start' '
canceled_job=$(flux submit --urgency=hold false) &&
id2=$(flux submit --dependency=afternotok:$canceled_job hostname) &&
flux cancel $canceled_job &&
flux job wait-event -vt 15 -m type=dependency $id2 exception
'
test_expect_success 'chain of afternotok jobs are canceled if one fails before start' '
id=$(flux submit --urgency=hold false) &&
id2=$(flux submit --dependency=afternotok:$id hostname) &&
id3=$(flux submit --dependency=afternotok:$id2 hostname) &&
id4=$(flux submit --dependency=afternotok:$id3 hostname) &&
flux cancel $id &&
for i in $id2 $id3 $id4; do
flux job wait-event -vt 15 -m type=dependency $id2 exception
done
'
test_expect_success 'chain of afternotok jobs are canceled if one succeeds' '
id=$(flux submit --urgency=hold true) &&
id2=$(flux submit --dependency=afternotok:$id hostname) &&
id3=$(flux submit --dependency=afternotok:$id2 hostname) &&
id4=$(flux submit --dependency=afternotok:$id3 hostname) &&
flux job urgency $id default &&
for i in $id2 $id3 $id4; do
flux job wait-event -vt 15 -m type=dependency $id2 exception
done
'
test_expect_success 'dependency=after works for INACTIVE jobs' '
run_timeout 15 \
flux bulksubmit --wait --watch \
Expand All @@ -205,7 +231,7 @@ test_expect_success 'dependency=afterok works for INACTIVE job' '
flux run --dependency=afterok:${job1} \
echo afterok:${job1} &&
test_must_fail flux run --dependency=afterok:${job2} hostname &&
test_must_fail flux run --dependency=afterok:${job3} hostname
test_must_fail flux run --dependency=afterok:${job3} hostname
'
test_expect_success 'dependency=afternotok works for INACTIVE job' '
run_timeout 15 \
Expand All @@ -215,6 +241,9 @@ test_expect_success 'dependency=afternotok works for INACTIVE job' '
echo afterany:{} ::: ${job2} ${job3} &&
test_must_fail flux run --dependency=afternotok:${job1} hostname
'
test_expect_success 'afternotok fails for INACTIVE job with no start event' '
test_must_fail flux run --dependency=afternotok:${canceled_job} hostname
'
test_expect_success 'dependency=after fails for INACTIVE canceled job' '
job4=$(flux submit --urgency=hold hostname) &&
flux cancel ${job4} &&
Expand Down

0 comments on commit 8a18be7

Please sign in to comment.