From 678cae67569ac6e04fb0a4ca385670928b939683 Mon Sep 17 00:00:00 2001 From: Adam Williamson Date: Dec 19 2022 21:03:17 +0000 Subject: Backport #4498 to solve a problem with retrying grouped tests --- diff --git a/0001-Don-t-restart-scheduled-or-running-chained-parents.patch b/0001-Don-t-restart-scheduled-or-running-chained-parents.patch new file mode 100644 index 0000000..69f6f3f --- /dev/null +++ b/0001-Don-t-restart-scheduled-or-running-chained-parents.patch @@ -0,0 +1,95 @@ +From 60cc55aa03583988751144a9f47024a110b4a9bb Mon Sep 17 00:00:00 2001 +From: Adam Williamson +Date: Mon, 19 Dec 2022 10:52:28 -0800 +Subject: [PATCH] Don't restart scheduled or running chained parents + +This is a follow-up to @martchus #4498. The intent of that PR, +as the commit message says, is to "Restart failed chained parents +"up the chain"". However, it actually does more than this: it +also restarts chained parents that are queued or running, because +they also do not have "OK" results. This tweaks the check a bit +so we only restart parents if they have a result which is known +to be not OK. This leaves out NONE, which is the result that +scheduled or running tests have. + +In a way we cannot know whether restarting a chained parent that +is still running is the right thing to do, but doing so definitely +causes problems in one real-world case: +https://progress.opensuse.org/projects/openqav3/activity?from=2022-12-19 +and from the examples backing the original change here, it seems +we only really intended to add the restart behaviour when the +parent test definitely failed, not when its result was not yet +decided. So on the principle of making minimal changes to the +previous behaviour, it seems reasonable to make this change, +which in fact restores the behaviour from before #4498 for +still-running parent jobs. The cases #4498 intended to address +with complex investigation jobs should still be addressed. + +This also updates the comments for `duplicate` to more accurately +reflect current behaviour, and highlight the problem that +ignoring children of a parent job can ignore not only other jobs +in the same parallel group as the child job, but other children of +the same parent that are *not* in the same parallel group as the +child job. + +Related progress issue: https://progress.opensuse.org/issues/110458 +Related progress issue: https://progress.opensuse.org/issues/112256 + +Signed-off-by: Adam Williamson +--- + lib/OpenQA/Schema/Result/Jobs.pm | 6 +++--- + t/05-scheduler-restart-and-duplicate.t | 5 +++-- + 2 files changed, 6 insertions(+), 5 deletions(-) + +diff --git a/lib/OpenQA/Schema/Result/Jobs.pm b/lib/OpenQA/Schema/Result/Jobs.pm +index b49a60661..0084777ea 100644 +--- a/lib/OpenQA/Schema/Result/Jobs.pm ++++ b/lib/OpenQA/Schema/Result/Jobs.pm +@@ -745,7 +744,7 @@ sub cluster_jobs ($self, @args) { + unless ($skip_parents || $cancelmode) { + my $parent_result = $p->result; + $p->cluster_jobs(jobs => $jobs, skip_children => 1, cancelmode => $cancelmode) +- if !$parent_result || !OpenQA::Jobs::Constants::is_ok_result($parent_result); ++ if !$parent_result || grep { $parent_result eq $_ } NOT_OK_RESULTS; + } + next; + } +@@ -861,13 +860,13 @@ for PARALLEL dependencies: + + if child is clone, find the latest clone and clone it + + for CHAINED dependencies: +-- do NOT clone parents ++- only clone failed parents, ignoring their children (our siblings, but also potentially our cousins from other child groups) + + create new dependency - duplicit cloning is prevented by ignorelist, webui will show multiple chained deps though + - clone children + + if child is clone, find the latest clone and clone it + + for DIRECTLY_CHAINED dependencies: +-- clone parents recursively but ignore their children (our siblings) ++- clone parents recursively but ignore their children (our siblings, but also potentially our cousins from other child groups) + + if parent is clone, find the latest clone and clone it + - clone children + + if child is clone, find the latest clone and clone it +diff --git a/t/05-scheduler-restart-and-duplicate.t b/t/05-scheduler-restart-and-duplicate.t +index 03a8b9c40..a4296184a 100644 +--- a/t/05-scheduler-restart-and-duplicate.t ++++ b/t/05-scheduler-restart-and-duplicate.t +@@ -133,12 +133,13 @@ subtest 'restart with (directly) chained child' => sub { + ); + my $job_data_99926; + subtest 'cluster jobs for 99937 which has one chained child and one chained parent' => sub { ++ job_get_rs(99926)->update({result => FAILED}); + is_deeply job_get_rs(99937)->cluster_jobs, \%expected_cluster, +- 'chained parent considered for restarting as its result is not ok'; ++ 'chained parent considered for restarting as its result is failed'; + job_get_rs(99926)->update({result => SOFTFAILED}); + $job_data_99926 = delete $expected_cluster{99926}; + is_deeply job_get_rs(99937)->cluster_jobs, \%expected_cluster, +- 'only child considered for restarting as the parent result is ok'; ++ 'only child considered for restarting as the parent result is not failed'; + }; + + # restart the job +-- +2.39.0 + diff --git a/openqa.spec b/openqa.spec index 38bed0f..459375e 100644 --- a/openqa.spec +++ b/openqa.spec @@ -92,7 +92,7 @@ Name: openqa Version: %{github_version}%{?github_date:^%{github_date}git%{shortcommit}} -Release: 2%{?dist} +Release: 3%{?dist} Summary: OS-level automated testing framework License: GPLv2+ Url: http://os-autoinst.github.io/openQA/ @@ -120,6 +120,12 @@ Source4: 23-fedora-messaging.t Source5: geekotest.conf Source6: openQA-worker.conf +# Try and tweak parent restart behaviour to avoid orphaning +# cockpit jobs when a FreeIPA child fails: +# https://progress.opensuse.org/issues/112256 +# https://progress.opensuse.org/issues/110458 +Patch0: 0001-Don-t-restart-scheduled-or-running-chained-parents.patch + BuildRequires: make BuildRequires: %{python_scripts_requires} BuildRequires: perl-generators @@ -669,6 +675,9 @@ fi %{_datadir}/openqa/lib/OpenQA/WebAPI/Plugin/FedoraUpdateRestart.pm %changelog +* Mon Dec 19 2022 Adam Williamson - 4.6^20221123gitb93eb7f-3 +- Backport #4498 (slightly rediffed) to solve a problem with retrying grouped tests + * Thu Dec 01 2022 Adam Williamson - 4.6^20221123gitb93eb7f-2 - FedoraMessaging: handle chunked ADVISORY_NVRS_N settings