|
|
678cae6 |
From 60cc55aa03583988751144a9f47024a110b4a9bb Mon Sep 17 00:00:00 2001
|
|
|
678cae6 |
From: Adam Williamson <awilliam@redhat.com>
|
|
|
678cae6 |
Date: Mon, 19 Dec 2022 10:52:28 -0800
|
|
|
678cae6 |
Subject: [PATCH] Don't restart scheduled or running chained parents
|
|
|
678cae6 |
|
|
|
678cae6 |
This is a follow-up to @martchus #4498. The intent of that PR,
|
|
|
678cae6 |
as the commit message says, is to "Restart failed chained parents
|
|
|
678cae6 |
"up the chain"". However, it actually does more than this: it
|
|
|
678cae6 |
also restarts chained parents that are queued or running, because
|
|
|
678cae6 |
they also do not have "OK" results. This tweaks the check a bit
|
|
|
678cae6 |
so we only restart parents if they have a result which is known
|
|
|
678cae6 |
to be not OK. This leaves out NONE, which is the result that
|
|
|
678cae6 |
scheduled or running tests have.
|
|
|
678cae6 |
|
|
|
678cae6 |
In a way we cannot know whether restarting a chained parent that
|
|
|
678cae6 |
is still running is the right thing to do, but doing so definitely
|
|
|
678cae6 |
causes problems in one real-world case:
|
|
|
678cae6 |
https://progress.opensuse.org/projects/openqav3/activity?from=2022-12-19
|
|
|
678cae6 |
and from the examples backing the original change here, it seems
|
|
|
678cae6 |
we only really intended to add the restart behaviour when the
|
|
|
678cae6 |
parent test definitely failed, not when its result was not yet
|
|
|
678cae6 |
decided. So on the principle of making minimal changes to the
|
|
|
678cae6 |
previous behaviour, it seems reasonable to make this change,
|
|
|
678cae6 |
which in fact restores the behaviour from before #4498 for
|
|
|
678cae6 |
still-running parent jobs. The cases #4498 intended to address
|
|
|
678cae6 |
with complex investigation jobs should still be addressed.
|
|
|
678cae6 |
|
|
|
678cae6 |
This also updates the comments for `duplicate` to more accurately
|
|
|
678cae6 |
reflect current behaviour, and highlight the problem that
|
|
|
678cae6 |
ignoring children of a parent job can ignore not only other jobs
|
|
|
678cae6 |
in the same parallel group as the child job, but other children of
|
|
|
678cae6 |
the same parent that are *not* in the same parallel group as the
|
|
|
678cae6 |
child job.
|
|
|
678cae6 |
|
|
|
678cae6 |
Related progress issue: https://progress.opensuse.org/issues/110458
|
|
|
678cae6 |
Related progress issue: https://progress.opensuse.org/issues/112256
|
|
|
678cae6 |
|
|
|
678cae6 |
Signed-off-by: Adam Williamson <awilliam@redhat.com>
|
|
|
678cae6 |
---
|
|
|
678cae6 |
lib/OpenQA/Schema/Result/Jobs.pm | 6 +++---
|
|
|
678cae6 |
t/05-scheduler-restart-and-duplicate.t | 5 +++--
|
|
|
678cae6 |
2 files changed, 6 insertions(+), 5 deletions(-)
|
|
|
678cae6 |
|
|
|
678cae6 |
diff --git a/lib/OpenQA/Schema/Result/Jobs.pm b/lib/OpenQA/Schema/Result/Jobs.pm
|
|
|
678cae6 |
index b49a60661..0084777ea 100644
|
|
|
678cae6 |
--- a/lib/OpenQA/Schema/Result/Jobs.pm
|
|
|
678cae6 |
+++ b/lib/OpenQA/Schema/Result/Jobs.pm
|
|
|
678cae6 |
@@ -745,7 +744,7 @@ sub cluster_jobs ($self, @args) {
|
|
|
678cae6 |
unless ($skip_parents || $cancelmode) {
|
|
|
678cae6 |
my $parent_result = $p->result;
|
|
|
678cae6 |
$p->cluster_jobs(jobs => $jobs, skip_children => 1, cancelmode => $cancelmode)
|
|
|
678cae6 |
- if !$parent_result || !OpenQA::Jobs::Constants::is_ok_result($parent_result);
|
|
|
678cae6 |
+ if !$parent_result || grep { $parent_result eq $_ } NOT_OK_RESULTS;
|
|
|
678cae6 |
}
|
|
|
678cae6 |
next;
|
|
|
678cae6 |
}
|
|
|
678cae6 |
@@ -861,13 +860,13 @@ for PARALLEL dependencies:
|
|
|
678cae6 |
+ if child is clone, find the latest clone and clone it
|
|
|
678cae6 |
|
|
|
678cae6 |
for CHAINED dependencies:
|
|
|
678cae6 |
-- do NOT clone parents
|
|
|
678cae6 |
+- only clone failed parents, ignoring their children (our siblings, but also potentially our cousins from other child groups)
|
|
|
678cae6 |
+ create new dependency - duplicit cloning is prevented by ignorelist, webui will show multiple chained deps though
|
|
|
678cae6 |
- clone children
|
|
|
678cae6 |
+ if child is clone, find the latest clone and clone it
|
|
|
678cae6 |
|
|
|
678cae6 |
for DIRECTLY_CHAINED dependencies:
|
|
|
678cae6 |
-- clone parents recursively but ignore their children (our siblings)
|
|
|
678cae6 |
+- clone parents recursively but ignore their children (our siblings, but also potentially our cousins from other child groups)
|
|
|
678cae6 |
+ if parent is clone, find the latest clone and clone it
|
|
|
678cae6 |
- clone children
|
|
|
678cae6 |
+ if child is clone, find the latest clone and clone it
|
|
|
678cae6 |
diff --git a/t/05-scheduler-restart-and-duplicate.t b/t/05-scheduler-restart-and-duplicate.t
|
|
|
678cae6 |
index 03a8b9c40..a4296184a 100644
|
|
|
678cae6 |
--- a/t/05-scheduler-restart-and-duplicate.t
|
|
|
678cae6 |
+++ b/t/05-scheduler-restart-and-duplicate.t
|
|
|
678cae6 |
@@ -133,12 +133,13 @@ subtest 'restart with (directly) chained child' => sub {
|
|
|
678cae6 |
);
|
|
|
678cae6 |
my $job_data_99926;
|
|
|
678cae6 |
subtest 'cluster jobs for 99937 which has one chained child and one chained parent' => sub {
|
|
|
678cae6 |
+ job_get_rs(99926)->update({result => FAILED});
|
|
|
678cae6 |
is_deeply job_get_rs(99937)->cluster_jobs, \%expected_cluster,
|
|
|
678cae6 |
- 'chained parent considered for restarting as its result is not ok';
|
|
|
678cae6 |
+ 'chained parent considered for restarting as its result is failed';
|
|
|
678cae6 |
job_get_rs(99926)->update({result => SOFTFAILED});
|
|
|
678cae6 |
$job_data_99926 = delete $expected_cluster{99926};
|
|
|
678cae6 |
is_deeply job_get_rs(99937)->cluster_jobs, \%expected_cluster,
|
|
|
678cae6 |
- 'only child considered for restarting as the parent result is ok';
|
|
|
678cae6 |
+ 'only child considered for restarting as the parent result is not failed';
|
|
|
678cae6 |
};
|
|
|
678cae6 |
|
|
|
678cae6 |
# restart the job
|
|
|
678cae6 |
--
|
|
|
678cae6 |
2.39.0
|
|
|
678cae6 |
|