From: Ilia Mirkin
Date: Wed, 1 Dec 2021 03:52:49 +0000 (-0500)
Subject: gitlab-ci: detect a3xx gpu hang recovery failure
X-Git-Tag: upstream/22.3.5~15179
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=268fc8e5c175f5759ffc180c855d0e1ac63ea2a9;p=platform%2Fupstream%2Fmesa.git

gitlab-ci: detect a3xx gpu hang recovery failure

But don't bail immediately, instead print out some more lines after the
hang, hopefully catching info about the cause of the hang.

Signed-off-by: Ilia Mirkin
Reviewed-by: Emma Anholt
Part-of:
---

diff --git a/.gitlab-ci/bare-metal/fastboot_run.py b/.gitlab-ci/bare-metal/fastboot_run.py
index 9fb2cb3..0521a38 100755
--- a/.gitlab-ci/bare-metal/fastboot_run.py
+++ b/.gitlab-ci/bare-metal/fastboot_run.py
@@ -70,7 +70,13 @@ class FastbootRun:
         if self.logged_system(self.fastboot) != 0:
             return 1
 
+        print_more_lines = -1
         for line in self.ser.lines():
+            if print_more_lines == 0:
+                return 2
+            if print_more_lines > 0:
+                print_more_lines -= 1
+
             if re.search("---. end Kernel panic", line):
                 return 1
 
@@ -92,6 +98,18 @@ class FastbootRun:
                     "Detected network device failure, restarting run...")
                 return 2
 
+            # A3xx recovery doesn't quite work. Sometimes the GPU will get
+            # wedged and recovery will fail (because power can't be reset?)
+            # This assumes that the jobs are sufficiently well-tested that GPU
+            # hangs aren't always triggered, so just try again. But print some
+            # more lines first so that we get better information on the cause
+            # of the hang. Once a hang happens, it's pretty chatty.
+            if "[drm:adreno_recover] *ERROR* gpu hw init failed: -22" in line:
+                self.print_error(
+                    "Detected GPU hang, restarting run...")
+                if print_more_lines == -1:
+                    print_more_lines = 30
+
             result = re.search("hwci: mesa: (\S*)", line)
             if result:
                 if result.group(1) == "pass":