Mesa (main): gitlab-ci: detect a3xx gpu hang recovery failure
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Fri Dec 3 23:52:02 UTC 2021
Module: Mesa
Branch: main
Commit: 268fc8e5c175f5759ffc180c855d0e1ac63ea2a9
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=268fc8e5c175f5759ffc180c855d0e1ac63ea2a9
Author: Ilia Mirkin <imirkin at alum.mit.edu>
Date: Tue Nov 30 22:52:49 2021 -0500
gitlab-ci: detect a3xx gpu hang recovery failure
But don't bail immediately, instead print out some more lines after the
hang, hopefully catching info about the cause of the hang.
Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
Reviewed-by: Emma Anholt <emma at anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14033>
---
.gitlab-ci/bare-metal/fastboot_run.py | 18 ++++++++++++++++++
1 file changed, 18 insertions(+)
diff --git a/.gitlab-ci/bare-metal/fastboot_run.py b/.gitlab-ci/bare-metal/fastboot_run.py
index 9fb2cb36b2b..0521a387da4 100755
--- a/.gitlab-ci/bare-metal/fastboot_run.py
+++ b/.gitlab-ci/bare-metal/fastboot_run.py
@@ -70,7 +70,13 @@ class FastbootRun:
if self.logged_system(self.fastboot) != 0:
return 1
+ print_more_lines = -1
for line in self.ser.lines():
+ if print_more_lines == 0:
+ return 2
+ if print_more_lines > 0:
+ print_more_lines -= 1
+
if re.search("---. end Kernel panic", line):
return 1
@@ -92,6 +98,18 @@ class FastbootRun:
"Detected network device failure, restarting run...")
return 2
+ # A3xx recovery doesn't quite work. Sometimes the GPU will get
+ # wedged and recovery will fail (because power can't be reset?)
+ # This assumes that the jobs are sufficiently well-tested that GPU
+ # hangs aren't always triggered, so just try again. But print some
+ # more lines first so that we get better information on the cause
+ # of the hang. Once a hang happens, it's pretty chatty.
+ if "[drm:adreno_recover] *ERROR* gpu hw init failed: -22" in line:
+ self.print_error(
+ "Detected GPU hang, restarting run...")
+ if print_more_lines == -1:
+ print_more_lines = 30
+
result = re.search("hwci: mesa: (\S*)", line)
if result:
if result.group(1) == "pass":
More information about the mesa-commit mailing list