Mesa (main): ci/lava: Retry when data fetching log RPC call is corrupted
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Thu Apr 28 07:34:16 UTC 2022
Module: Mesa
Branch: main
Commit: 201b0b6d2948eedfd1821e5173ecba09f0d453c6
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=201b0b6d2948eedfd1821e5173ecba09f0d453c6
Author: Guilherme Gallo <guilherme.gallo at collabora.com>
Date: Wed Apr 20 20:55:47 2022 -0300
ci/lava: Retry when data fetching log RPC call is corrupted
Rarely the jobs.logs RPC call can return corrupted data, such as
mal-formed YAML data. As this is expected and very rare to occur, let's
retry this RPC call several times to give it a chance to fix itself.
Retrying would not swallow the log lines since we keep track of how many
log lines each job has.
Signed-off-by: Guilherme Gallo <guilherme.gallo at collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15938>
---
.gitlab-ci/lava/exceptions.py | 6 ++++-
.gitlab-ci/lava/lava_job_submitter.py | 39 ++++++++++++++++++++++-------
.gitlab-ci/tests/test_lava_job_submitter.py | 31 +++++++++++++++++++++--
3 files changed, 64 insertions(+), 12 deletions(-)
diff --git a/.gitlab-ci/lava/exceptions.py b/.gitlab-ci/lava/exceptions.py
index 576924f2b7d..1c1db591897 100644
--- a/.gitlab-ci/lava/exceptions.py
+++ b/.gitlab-ci/lava/exceptions.py
@@ -14,4 +14,8 @@ class MesaCITimeoutError(MesaCIException):
class MesaCIRetryError(MesaCIException):
def __init__(self, *args, retry_count: int) -> None:
super().__init__(*args)
- self.retry_count = retry_count
\ No newline at end of file
+ self.retry_count = retry_count
+
+
+class MesaCIParseException(MesaCIException):
+ pass
diff --git a/.gitlab-ci/lava/lava_job_submitter.py b/.gitlab-ci/lava/lava_job_submitter.py
index cc5063b2727..fb4b04420c2 100755
--- a/.gitlab-ci/lava/lava_job_submitter.py
+++ b/.gitlab-ci/lava/lava_job_submitter.py
@@ -24,7 +24,9 @@
"""Send a job to LAVA, track it and collect log back"""
+
import argparse
+import contextlib
import pathlib
import re
import sys
@@ -38,7 +40,12 @@ from typing import Any, Optional
import lavacli
import yaml
-from lava.exceptions import MesaCIException, MesaCIRetryError, MesaCITimeoutError
+from lava.exceptions import (
+ MesaCIException,
+ MesaCIParseException,
+ MesaCIRetryError,
+ MesaCITimeoutError,
+)
from lavacli.utils import loader
# Timeout in seconds to decide if the device from the dispatched LAVA job has
@@ -264,20 +271,26 @@ class LAVAJob:
)
return job_state["job_state"] not in waiting_states
+ def _load_log_from_data(self, data) -> list[str]:
+ lines = []
+ # When there is no new log data, the YAML is empty
+ if loaded_lines := yaml.load(str(data), Loader=loader(False)):
+ lines = loaded_lines
+ # If we had non-empty log data, we can assure that the device is alive.
+ self.heartbeat()
+ self.last_log_line += len(lines)
+ return lines
+
def get_logs(self) -> list[str]:
try:
(finished, data) = _call_proxy(
self.proxy.scheduler.jobs.logs, self.job_id, self.last_log_line
)
- lines = yaml.load(str(data), Loader=loader(False))
self.is_finished = finished
- if not lines:
- return []
- self.heartbeat()
- self.last_log_line += len(lines)
- return lines
+ return self._load_log_from_data(data)
+
except Exception as mesa_ci_err:
- raise MesaCIException(
+ raise MesaCIParseException(
f"Could not get LAVA job logs. Reason: {mesa_ci_err}"
) from mesa_ci_err
@@ -381,7 +394,15 @@ def fetch_logs(job, max_idle_time):
time.sleep(LOG_POLLING_TIME_SEC)
- new_lines = job.get_logs()
+ # The XMLRPC binary packet may be corrupted, causing a YAML scanner error.
+ # Retry the log fetching several times before exposing the error.
+ for _ in range(5):
+ with contextlib.suppress(MesaCIParseException):
+ new_lines = job.get_logs()
+ break
+ else:
+ raise MesaCIParseException
+
parsed_lines = parse_lava_lines(new_lines)
for line in parsed_lines:
diff --git a/.gitlab-ci/tests/test_lava_job_submitter.py b/.gitlab-ci/tests/test_lava_job_submitter.py
index d2f900a2794..49ad219d3e4 100644
--- a/.gitlab-ci/tests/test_lava_job_submitter.py
+++ b/.gitlab-ci/tests/test_lava_job_submitter.py
@@ -168,7 +168,9 @@ PROXY_SCENARIOS = {
{},
),
"timed out more times than retry attempts": (
- generate_n_logs(n=4, tick_fn=DEVICE_HANGING_TIMEOUT_SEC + 1),
+ generate_n_logs(
+ n=NUMBER_OF_MAX_ATTEMPTS + 1, tick_fn=DEVICE_HANGING_TIMEOUT_SEC + 1
+ ),
pytest.raises(MesaCIRetryError),
False,
{},
@@ -316,4 +318,29 @@ def test_hide_sensitive_data(input, expectation, tag):
def test_get_job_results(mock_proxy):
proxy = mock_proxy()
- get_job_results(proxy, 1, "0_mesa")
\ No newline at end of file
+ get_job_results(proxy, 1, "0_mesa")
+
+
+CORRUPTED_LOG_SCENARIOS = {
+ "too much subsequent corrupted data": (
+ [(False, "{'msg': 'Incomplete}")] * 100 + [jobs_logs_response(True)],
+ pytest.raises((MesaCIRetryError)),
+ ),
+ "one subsequent corrupted data": (
+ [(False, "{'msg': 'Incomplete}")] * 2 + [jobs_logs_response(True)],
+ does_not_raise(),
+ ),
+}
+
+
+ at pytest.mark.parametrize(
+ "data_sequence, expected_exception",
+ CORRUPTED_LOG_SCENARIOS.values(),
+ ids=CORRUPTED_LOG_SCENARIOS.keys(),
+)
+def test_log_corruption(mock_sleep, data_sequence, expected_exception, mock_proxy):
+ proxy_mock = mock_proxy()
+ proxy_logs_mock = proxy_mock.scheduler.jobs.logs
+ proxy_logs_mock.side_effect = data_sequence
+ with expected_exception:
+ retriable_follow_job(proxy_mock, "")
More information about the mesa-commit
mailing list