Mesa (main): ci/lava: Detect R8152 issues preemptively and retry

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Fri Jul 8 12:54:57 UTC 2022


Module: Mesa
Branch: main
Commit: 2c51b7a9c975199ea22b560efd2e2da94cdab47f
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=2c51b7a9c975199ea22b560efd2e2da94cdab47f

Author: Guilherme Gallo <guilherme.gallo at collabora.com>
Date:   Wed Jul  6 22:59:11 2022 -0300

ci/lava: Detect R8152 issues preemptively and retry

Implement a log-based retry hint for R8152 issue described in #6681,
which is based on detecting these two consecutive lines:

```
r8152 <USB> eth0: Tx status -71
nfs: server <IP> not responding, still trying
```

Where <IP> and <USB> could be any IP and USB addresses, respectfully.

This commit is a temporary fix since it requires a section-aware log
follower, implemented in !16323. When the cited MR is merged, one will
make a proper fix on top of that.

Closes: #6681

Signed-off-by: Guilherme Gallo <guilherme.gallo at collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17389>

---

 .gitlab-ci/lava/exceptions.py           |  7 ++++
 .gitlab-ci/lava/utils/lava_log_hints.py | 40 ++++++++++++++++++++++
 .gitlab-ci/lava/utils/log_follower.py   |  5 +++
 .gitlab-ci/tests/utils/test_lava_log.py | 59 ++++++++++++++++++++++++++++++++-
 4 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/.gitlab-ci/lava/exceptions.py b/.gitlab-ci/lava/exceptions.py
index 1c1db591897..3c9a63eb3c6 100644
--- a/.gitlab-ci/lava/exceptions.py
+++ b/.gitlab-ci/lava/exceptions.py
@@ -19,3 +19,10 @@ class MesaCIRetryError(MesaCIException):
 
 class MesaCIParseException(MesaCIException):
     pass
+
+
+class MesaCIKnownIssueException(MesaCIException):
+    """Exception raised when the Mesa CI script finds something in the logs that
+    is known to cause the LAVA job to eventually fail"""
+
+    pass
diff --git a/.gitlab-ci/lava/utils/lava_log_hints.py b/.gitlab-ci/lava/utils/lava_log_hints.py
new file mode 100644
index 00000000000..12317bb1127
--- /dev/null
+++ b/.gitlab-ci/lava/utils/lava_log_hints.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from lava.utils import LogFollower
+
+from lava.exceptions import MesaCIKnownIssueException
+from lava.utils.log_section import LogSectionType
+
+
+ at dataclass
+class LAVALogHints:
+    log_follower: LogFollower
+    has_r8152_issue_history: bool = field(default=False, init=False)
+
+    def detect_failure(self, new_lines: list[dict[str, Any]]):
+        for line in new_lines:
+            self.detect_r8152_issue(line)
+
+    def detect_r8152_issue(self, line):
+        if (
+            self.log_follower.phase == LogSectionType.TEST_CASE
+            and line["lvl"] == "target"
+        ):
+            if re.search(r"r8152 \S+ eth0: Tx status -71", line["msg"]):
+                self.has_r8152_issue_history = True
+                return
+
+            if self.has_r8152_issue_history and re.search(
+                r"nfs: server \d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3} not responding, still trying",
+                line["msg"],
+            ):
+                raise MesaCIKnownIssueException(
+                    "Probable network issue failure encountered, retrying the job"
+                )
+
+        self.has_r8152_issue_history = False
diff --git a/.gitlab-ci/lava/utils/log_follower.py b/.gitlab-ci/lava/utils/log_follower.py
index 532610dfba2..d08923bf07b 100644
--- a/.gitlab-ci/lava/utils/log_follower.py
+++ b/.gitlab-ci/lava/utils/log_follower.py
@@ -20,6 +20,7 @@ from typing import Optional, Union
 from lava.exceptions import MesaCITimeoutError
 from lava.utils.console_format import CONSOLE_LOG
 from lava.utils.gitlab_section import GitlabSection
+from lava.utils.lava_log_hints import LAVALogHints
 from lava.utils.log_section import (
     DEFAULT_GITLAB_SECTION_TIMEOUTS,
     FALLBACK_GITLAB_SECTION_TIMEOUT,
@@ -36,12 +37,14 @@ class LogFollower:
     )
     fallback_timeout: timedelta = FALLBACK_GITLAB_SECTION_TIMEOUT
     _buffer: list[str] = field(default_factory=list, init=False)
+    log_hints: LAVALogHints = field(init=False)
 
     def __post_init__(self):
         section_is_created = bool(self.current_section)
         section_has_started = bool(
             self.current_section and self.current_section.has_started
         )
+        self.log_hints = LAVALogHints(self)
         assert (
             section_is_created == section_has_started
         ), "Can't follow logs beginning from uninitialized GitLab sections."
@@ -138,6 +141,8 @@ class LogFollower:
             if parsed_line := parse_lava_line(line):
                 self._buffer.append(parsed_line)
 
+        self.log_hints.detect_failure(new_lines)
+
         return is_job_healthy
 
     def flush(self) -> list[str]:
diff --git a/.gitlab-ci/tests/utils/test_lava_log.py b/.gitlab-ci/tests/utils/test_lava_log.py
index cc5e8eec9bd..1219cc9f830 100644
--- a/.gitlab-ci/tests/utils/test_lava_log.py
+++ b/.gitlab-ci/tests/utils/test_lava_log.py
@@ -9,7 +9,7 @@ from datetime import datetime, timedelta
 
 import pytest
 import yaml
-from lava.exceptions import MesaCITimeoutError
+from lava.exceptions import MesaCIKnownIssueException, MesaCITimeoutError
 from lava.utils import (
     GitlabSection,
     LogFollower,
@@ -290,3 +290,60 @@ def test_gitlab_section_id(case_name, expected_id):
 
     assert gl.id == expected_id
 
+
+A618_NETWORK_ISSUE_LOGS = [
+    create_lava_yaml_msg(
+        msg="[ 1733.599402] r8152 2-1.3:1.0 eth0: Tx status -71", lvl="target"
+    ),
+    create_lava_yaml_msg(
+        msg="[ 1733.604506] nfs: server 192.168.201.1 not responding, still trying",
+        lvl="target",
+    ),
+]
+TEST_PHASE_LAVA_SIGNAL = create_lava_yaml_msg(
+    msg="Received signal: <STARTTC> mesa-ci_a618_vk", lvl="debug"
+)
+
+
+A618_NETWORK_ISSUE_SCENARIOS = {
+    "Pass - R8152 kmsg during boot": (A618_NETWORK_ISSUE_LOGS, does_not_raise()),
+    "Fail - R8152 kmsg during test phase": (
+        [TEST_PHASE_LAVA_SIGNAL, *A618_NETWORK_ISSUE_LOGS],
+        pytest.raises(MesaCIKnownIssueException),
+    ),
+    "Pass - Partial (1) R8152 kmsg during test phase": (
+        [TEST_PHASE_LAVA_SIGNAL, A618_NETWORK_ISSUE_LOGS[0]],
+        does_not_raise(),
+    ),
+    "Pass - Partial (2) R8152 kmsg during test phase": (
+        [TEST_PHASE_LAVA_SIGNAL, A618_NETWORK_ISSUE_LOGS[1]],
+        does_not_raise(),
+    ),
+    "Pass - Partial subsequent (3) R8152 kmsg during test phase": (
+        [
+            TEST_PHASE_LAVA_SIGNAL,
+            A618_NETWORK_ISSUE_LOGS[0],
+            A618_NETWORK_ISSUE_LOGS[0],
+        ],
+        does_not_raise(),
+    ),
+    "Pass - Partial subsequent (4) R8152 kmsg during test phase": (
+        [
+            TEST_PHASE_LAVA_SIGNAL,
+            A618_NETWORK_ISSUE_LOGS[1],
+            A618_NETWORK_ISSUE_LOGS[1],
+        ],
+        does_not_raise(),
+    ),
+}
+
+
+ at pytest.mark.parametrize(
+    "messages, expectation",
+    A618_NETWORK_ISSUE_SCENARIOS.values(),
+    ids=A618_NETWORK_ISSUE_SCENARIOS.keys(),
+)
+def test_detect_failure(messages, expectation):
+    lf = LogFollower()
+    with expectation:
+        lf.feed(messages)



More information about the mesa-commit mailing list