Mesa (main): ci: Make LAVA jobs fail CI job when retry is exhausted

Thu Feb 17 00:00:31 UTC 2022

Module: Mesa
Branch: main
Commit: addac10443ac1dbe2405e4ce1652cfe681dcc46e
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=addac10443ac1dbe2405e4ce1652cfe681dcc46e

Author: Guilherme Gallo <guilherme.gallo at collabora.com>
Date:   Wed Feb 16 01:07:57 2022 -0300

ci: Make LAVA jobs fail CI job when retry is exhausted

When the lava_job_submitter.py retry loop runs to completion (i.e. without
leaving the loop through `break`), it means that the submitter has
exhausted the retry count limit. However, when that happens the script
still finishes normally instead of reporting a failure. This patch handles
this case by warning the user about what happened and forcing the job to fail.

Moreover, this commit makes the retry and timeout settings configurable per
CI job: each setting is now read from the corresponding environment variable
below, falling back to its built-in default when the variable is not set:

- LAVA_DEVICE_HANGING_TIMEOUT_SEC
- LAVA_WAIT_FOR_DEVICE_POLLING_TIME_SEC
- LAVA_LOG_POLLING_TIME_SEC
- LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION

Signed-off-by: Guilherme Gallo <guilherme.gallo at collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14876>

---

 .gitlab-ci/lava/lava_job_submitter.py | 79 +++++++++++++++++++----------------
 1 file changed, 43 insertions(+), 36 deletions(-)

diff --git a/.gitlab-ci/lava/lava_job_submitter.py b/.gitlab-ci/lava/lava_job_submitter.py
index 5d1f469e7c6..a83662afd5c 100755
--- a/.gitlab-ci/lava/lava_job_submitter.py
+++ b/.gitlab-ci/lava/lava_job_submitter.py
@@ -31,25 +31,27 @@ import time
 import traceback
 import urllib.parse
 import xmlrpc
+
 from datetime import datetime, timedelta
+from os import getenv
 
 import lavacli
 import yaml
 from lavacli.utils import loader
 
-# Timeout in minutes to decide if the device from the dispatched LAVA job has
+# Timeout in seconds to decide if the device from the dispatched LAVA job has
 # hung or not due to the lack of new log output.
-DEVICE_HANGING_TIMEOUT_MIN = 5
+DEVICE_HANGING_TIMEOUT_SEC = int(getenv("LAVA_DEVICE_HANGING_TIMEOUT_SEC",  5*60))
 
 # How many seconds the script should wait before try a new polling iteration to
 # check if the dispatched LAVA job is running or waiting in the job queue.
-WAIT_FOR_DEVICE_POLLING_TIME_SEC = 10
+WAIT_FOR_DEVICE_POLLING_TIME_SEC = int(getenv("LAVA_WAIT_FOR_DEVICE_POLLING_TIME_SEC", 10))
 
 # How many seconds to wait between log output LAVA RPC calls.
-LOG_POLLING_TIME_SEC = 5
+LOG_POLLING_TIME_SEC = int(getenv("LAVA_LOG_POLLING_TIME_SEC", 5))
 
 # How many retries should be made when a timeout happen.
-NUMBER_OF_RETRIES_TIMEOUT_DETECTION = 2
+NUMBER_OF_RETRIES_TIMEOUT_DETECTION = int(getenv("LAVA_NUMBER_OF_RETRIES_TIMEOUT_DETECTION", 2))
 
 
 def print_log(msg):
@@ -61,14 +63,7 @@ def fatal_err(msg):
 
 
 def hide_sensitive_data(yaml_data, hide_tag="HIDEME"):
-    out_data = ""
-
-    for line in yaml_data.splitlines(True):
-        if hide_tag in line:
-            continue
-        out_data += line
-
-    return out_data
+    return "".join(line for line in yaml_data.splitlines(True) if hide_tag not in line)
 
 
 def generate_lava_yaml(args):
@@ -211,7 +206,6 @@ def _call_proxy(fn, *args):
                 fatal_err("A protocol error occurred (Err {} {})".format(err.errcode, err.errmsg))
             else:
                 time.sleep(15)
-                pass
         except xmlrpc.client.Fault as err:
             traceback.print_exc()
             fatal_err("FATAL: Fault: {} (code: {})".format(err.faultString, err.faultCode))
@@ -222,8 +216,8 @@ def get_job_results(proxy, job_id, test_suite, test_case):
     results_yaml = _call_proxy(proxy.results.get_testjob_results_yaml, job_id)
     results = yaml.load(results_yaml, Loader=loader(False))
     for res in results:
-        metadata = res['metadata']
-        if not 'result' in metadata or metadata['result'] != 'fail':
+        metadata = res["metadata"]
+        if "result" not in metadata or metadata["result"] != "fail":
             continue
         if 'error_type' in metadata and metadata['error_type'] == "Infrastructure":
             print_log("LAVA job {} failed with Infrastructure Error. Retry.".format(job_id))
@@ -260,8 +254,7 @@ def follow_job_execution(proxy, job_id):
     last_time_logs = datetime.now()
     while not finished:
         (finished, data) = _call_proxy(proxy.scheduler.jobs.logs, job_id, line_count)
-        logs = yaml.load(str(data), Loader=loader(False))
-        if logs:
+        if logs := yaml.load(str(data), Loader=loader(False)):
             # Reset the timeout
             last_time_logs = datetime.now()
             for line in logs:
@@ -270,7 +263,7 @@ def follow_job_execution(proxy, job_id):
             line_count += len(logs)
 
         else:
-            time_limit = timedelta(minutes=DEVICE_HANGING_TIMEOUT_MIN)
+            time_limit = timedelta(seconds=DEVICE_HANGING_TIMEOUT_SEC)
             if datetime.now() - last_time_logs > time_limit:
                 print_log("LAVA job {} doesn't advance (machine got hung?). Retry.".format(job_id))
                 return False
@@ -298,21 +291,7 @@ def submit_job(proxy, job_file):
     return _call_proxy(proxy.scheduler.jobs.submit, job_file)
 
 
-def main(args):
-    proxy = setup_lava_proxy()
-
-    yaml_file = generate_lava_yaml(args)
-
-    if args.dump_yaml:
-        print(hide_sensitive_data(generate_lava_yaml(args)))
-
-    if args.validate_only:
-        ret = validate_job(proxy, yaml_file)
-        if not ret:
-            fatal_err("Error in LAVA job definition")
-        print("LAVA job definition validated successfully")
-        return
-
+def retriable_follow_job(proxy, yaml_file):
     retry_count = NUMBER_OF_RETRIES_TIMEOUT_DETECTION
 
     while retry_count >= 0:
@@ -332,8 +311,36 @@ def main(args):
 
         show_job_data(proxy, job_id)
 
-        if get_job_results(proxy,  job_id, "0_mesa", "mesa") == True:
-             break
+        if get_job_results(proxy, job_id, "0_mesa", "mesa") == True:
+            break
+    else:
+        # The script attempted all the retries. The job seemed to fail.
+        return False
+
+    return True
+
+
+def main(args):
+    proxy = setup_lava_proxy()
+
+    yaml_file = generate_lava_yaml(args)
+
+    if args.dump_yaml:
+        print(hide_sensitive_data(generate_lava_yaml(args)))
+
+    if args.validate_only:
+        ret = validate_job(proxy, yaml_file)
+        if not ret:
+            fatal_err("Error in LAVA job definition")
+        print("LAVA job definition validated successfully")
+        return
+
+    if not retriable_follow_job(proxy, yaml_file):
+        fatal_err(
+            "Job failed after it exceeded the number of"
+            f"{NUMBER_OF_RETRIES_TIMEOUT_DETECTION} retries."
+        )
+
 
 def create_parser():
     parser = argparse.ArgumentParser("LAVA job submitter")