Mesa (main): ci/lava: Retry when data fetching log RPC call is corrupted

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Thu Apr 28 07:34:16 UTC 2022


Module: Mesa
Branch: main
Commit: 201b0b6d2948eedfd1821e5173ecba09f0d453c6
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=201b0b6d2948eedfd1821e5173ecba09f0d453c6

Author: Guilherme Gallo <guilherme.gallo at collabora.com>
Date:   Wed Apr 20 20:55:47 2022 -0300

ci/lava: Retry when data fetching log RPC call is corrupted

Rarely the jobs.logs RPC call can return corrupted data, such as
mal-formed YAML data. As this is expected and very rare to occur, let's
retry this RPC call several times to give it a chance to fix itself.

Retrying would not swallow the log lines since we keep track of how many
log lines each job has.

Signed-off-by: Guilherme Gallo <guilherme.gallo at collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15938>

---

 .gitlab-ci/lava/exceptions.py               |  6 ++++-
 .gitlab-ci/lava/lava_job_submitter.py       | 39 ++++++++++++++++++++++-------
 .gitlab-ci/tests/test_lava_job_submitter.py | 31 +++++++++++++++++++++--
 3 files changed, 64 insertions(+), 12 deletions(-)

diff --git a/.gitlab-ci/lava/exceptions.py b/.gitlab-ci/lava/exceptions.py
index 576924f2b7d..1c1db591897 100644
--- a/.gitlab-ci/lava/exceptions.py
+++ b/.gitlab-ci/lava/exceptions.py
@@ -14,4 +14,8 @@ class MesaCITimeoutError(MesaCIException):
 class MesaCIRetryError(MesaCIException):
     def __init__(self, *args, retry_count: int) -> None:
         super().__init__(*args)
-        self.retry_count = retry_count
\ No newline at end of file
+        self.retry_count = retry_count
+
+
+class MesaCIParseException(MesaCIException):
+    pass
diff --git a/.gitlab-ci/lava/lava_job_submitter.py b/.gitlab-ci/lava/lava_job_submitter.py
index cc5063b2727..fb4b04420c2 100755
--- a/.gitlab-ci/lava/lava_job_submitter.py
+++ b/.gitlab-ci/lava/lava_job_submitter.py
@@ -24,7 +24,9 @@
 
 """Send a job to LAVA, track it and collect log back"""
 
+
 import argparse
+import contextlib
 import pathlib
 import re
 import sys
@@ -38,7 +40,12 @@ from typing import Any, Optional
 
 import lavacli
 import yaml
-from lava.exceptions import MesaCIException, MesaCIRetryError, MesaCITimeoutError
+from lava.exceptions import (
+    MesaCIException,
+    MesaCIParseException,
+    MesaCIRetryError,
+    MesaCITimeoutError,
+)
 from lavacli.utils import loader
 
 # Timeout in seconds to decide if the device from the dispatched LAVA job has
@@ -264,20 +271,26 @@ class LAVAJob:
         )
         return job_state["job_state"] not in waiting_states
 
+    def _load_log_from_data(self, data) -> list[str]:
+        lines = []
+        # When there is no new log data, the YAML is empty
+        if loaded_lines := yaml.load(str(data), Loader=loader(False)):
+            lines = loaded_lines
+            # If we had non-empty log data, we can assure that the device is alive.
+            self.heartbeat()
+            self.last_log_line += len(lines)
+        return lines
+
     def get_logs(self) -> list[str]:
         try:
             (finished, data) = _call_proxy(
                 self.proxy.scheduler.jobs.logs, self.job_id, self.last_log_line
             )
-            lines = yaml.load(str(data), Loader=loader(False))
             self.is_finished = finished
-            if not lines:
-                return []
-            self.heartbeat()
-            self.last_log_line += len(lines)
-            return lines
+            return self._load_log_from_data(data)
+
         except Exception as mesa_ci_err:
-            raise MesaCIException(
+            raise MesaCIParseException(
                 f"Could not get LAVA job logs. Reason: {mesa_ci_err}"
             ) from mesa_ci_err
 
@@ -381,7 +394,15 @@ def fetch_logs(job, max_idle_time):
 
     time.sleep(LOG_POLLING_TIME_SEC)
 
-    new_lines = job.get_logs()
+    # The XMLRPC binary packet may be corrupted, causing a YAML scanner error.
+    # Retry the log fetching several times before exposing the error.
+    for _ in range(5):
+        with contextlib.suppress(MesaCIParseException):
+            new_lines = job.get_logs()
+            break
+    else:
+        raise MesaCIParseException
+
     parsed_lines = parse_lava_lines(new_lines)
 
     for line in parsed_lines:
diff --git a/.gitlab-ci/tests/test_lava_job_submitter.py b/.gitlab-ci/tests/test_lava_job_submitter.py
index d2f900a2794..49ad219d3e4 100644
--- a/.gitlab-ci/tests/test_lava_job_submitter.py
+++ b/.gitlab-ci/tests/test_lava_job_submitter.py
@@ -168,7 +168,9 @@ PROXY_SCENARIOS = {
         {},
     ),
     "timed out more times than retry attempts": (
-        generate_n_logs(n=4, tick_fn=DEVICE_HANGING_TIMEOUT_SEC + 1),
+        generate_n_logs(
+            n=NUMBER_OF_MAX_ATTEMPTS + 1, tick_fn=DEVICE_HANGING_TIMEOUT_SEC + 1
+        ),
         pytest.raises(MesaCIRetryError),
         False,
         {},
@@ -316,4 +318,29 @@ def test_hide_sensitive_data(input, expectation, tag):
 
 def test_get_job_results(mock_proxy):
     proxy = mock_proxy()
-    get_job_results(proxy, 1, "0_mesa")
\ No newline at end of file
+    get_job_results(proxy, 1, "0_mesa")
+
+
+CORRUPTED_LOG_SCENARIOS = {
+    "too much subsequent corrupted data": (
+        [(False, "{'msg': 'Incomplete}")] * 100 + [jobs_logs_response(True)],
+        pytest.raises((MesaCIRetryError)),
+    ),
+    "one subsequent corrupted data": (
+        [(False, "{'msg': 'Incomplete}")] * 2 + [jobs_logs_response(True)],
+        does_not_raise(),
+    ),
+}
+
+
+ at pytest.mark.parametrize(
+    "data_sequence, expected_exception",
+    CORRUPTED_LOG_SCENARIOS.values(),
+    ids=CORRUPTED_LOG_SCENARIOS.keys(),
+)
+def test_log_corruption(mock_sleep, data_sequence, expected_exception, mock_proxy):
+    proxy_mock = mock_proxy()
+    proxy_logs_mock = proxy_mock.scheduler.jobs.logs
+    proxy_logs_mock.side_effect = data_sequence
+    with expected_exception:
+        retriable_follow_job(proxy_mock, "")



More information about the mesa-commit mailing list