[Ezbench-dev] [PATCH 1/2] smartezbench: resume incomplete runs, if possible

Thu Feb 2 09:12:54 UTC 2017

From: Martin Peres <martin.peres at linux.intel.com>

v2 (Petri): Allow resuming runs with subtests
---

The first version did not resume runs when a bisection jammed the
machine. I reworked the code a bit to simply schedule all resumable
tasks first, preferring those for the already-deployed version, and it
seems to work now.


python-modules/ezbench/smartezbench.py | 68 ++++++++++++++++++++++++++--------
 1 file changed, 52 insertions(+), 16 deletions(-)

diff --git a/python-modules/ezbench/smartezbench.py b/python-modules/ezbench/smartezbench.py
index 5f3474b..4e0f6dc 100644
--- a/python-modules/ezbench/smartezbench.py
+++ b/python-modules/ezbench/smartezbench.py
@@ -25,7 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 """
 
-from collections import namedtuple
+from collections import namedtuple, deque
 from datetime import datetime, timedelta
 from enum import Enum
 import numpy as np
@@ -92,10 +92,11 @@ def list_smart_ezbench_report_names(ezbench_dir, updatedSince = 0):
     return reports
 
 class TaskEntry:
-    def __init__(self, commit, test, rounds):
+    def __init__(self, commit, test, rounds, resumeResultFile = None):
         self.commit = commit
         self.test = test
         self.rounds = rounds
+        self.resumeResultFile = resumeResultFile
         self.start_date = None
         self.exec_time = None
         self.build_time = None
@@ -563,8 +564,8 @@ class SmartEzbench:
 
         return c, tl, self._events_str
 
-    def __prioritize_runs(self, task_tree, deployed_version):
-        task_list = list()
+    def __prioritize_runs(self, task_tree, deployed_version, resumable_tasks):
+        task_list = deque()
 
         # Aggregate all the subtests
         for commit in task_tree:
@@ -587,7 +588,22 @@ class SmartEzbench:
                 task_tree[commit]["tests"][full_name] = dict()
                 task_tree[commit]["tests"][full_name]["rounds"] = test_rounds[basename]
 
-        # Schedule the tests using the already-deployed version
+        # Schedule resumable tasks. First the already-deployed
+        # versions, other versions later
+        for task in resumable_tasks:
+            result_file = task.get("result_file", None)
+            if result_file is not None:
+                entry = TaskEntry(deployed_version, task["test"], 1, result_file)
+            else:
+                continue
+
+            if deployed_version == task["version"]:
+                task_list.appendleft(entry)
+            else:
+                task_list.append(entry)
+
+        # Schedule the tests using the already-deployed version after
+        # all resumable tasks
         if deployed_version is not None and deployed_version in task_tree:
             for test in task_tree[deployed_version]["tests"]:
                 rounds = task_tree[deployed_version]["tests"][test]["rounds"]
@@ -678,8 +694,8 @@ class SmartEzbench:
                     for key in result.results():
                         full_name = Test.partial_name(result.test.full_name, [key])
                         SmartEzbench.__remove_task_from_tasktree__(task_tree, commit.full_sha1, full_name, len(result.result(key)))
-                        # HACK: Remove this when all the new reports use the full_sha1 for storage
-                        SmartEzbench.__remove_task_from_tasktree__(task_tree, commit.sha1, full_name, len(result.result(key)))
+
+            resumable_tasks = report.journal.incomplete_tests()
 
             # Delete the tests on commits that do not compile
             for commit in report.commits:
@@ -696,7 +712,7 @@ class SmartEzbench:
             pass
 
         # Return the result
-        q.put((exit_code, task_tree, events_str))
+        q.put((exit_code, task_tree, events_str, resumable_tasks))
 
     def run(self):
         self.__log(Criticality.II, "----------------------")
@@ -731,7 +747,7 @@ class SmartEzbench:
         p = multiprocessing.Process(target=SmartEzbench.__generate_task_and_events_list__,
                                     args=(q, self.state, self.log_folder, self.repo()))
         p.start()
-        exit_code, task_tree, self._events_str = q.get()
+        exit_code, task_tree, self._events_str, resumable_tasks = q.get()
         p.join()
 
         if len(task_tree) == 0:
@@ -740,13 +756,14 @@ class SmartEzbench:
 
         task_tree_str = pprint.pformat(task_tree)
         self.__log(Criticality.II, "Task list: {tsk_str}".format(tsk_str=task_tree_str))
+        self.__log(Criticality.II, "Incomplete runs: {}".format(resumable_tasks))
 
         # Lock the report for further changes (like for profiles)
         self.__write_attribute__('beenRunBefore', True)
 
         # Prioritize --> return a list of commits to do in order
         self._task_lock.acquire()
-        self._task_list = self.__prioritize_runs(task_tree, deployed_commit)
+        self._task_list = self.__prioritize_runs(task_tree, deployed_commit, resumable_tasks)
 
         # Call the hook file, telling we started running
         self.__call_hook__('start_running_tests')
@@ -763,7 +780,7 @@ class SmartEzbench:
                 self.__done_running__(runner)
                 return False
 
-            self._task_current = e = self._task_list.pop(0)
+            self._task_current = e = self._task_list.popleft()
             short_name=e.test[:80].rsplit('|', 1)[0]+'...'
             self.__log(Criticality.DD,
                        "make {count} runs for test {test} using commit {commit}".format(count=e.rounds,
@@ -784,21 +801,40 @@ class SmartEzbench:
             self._task_current.started()
             for r in range(0, e.rounds):
                 self._task_lock.release()
+
                 try:
-                    time, cmd_output = runner.run(e.commit, e.test, False)
+                    if e.resumeResultFile is not None:
+                        time, cmd_output = runner.resume(e.commit, e.test, e.resumeResultFile, False)
+                    else:
+                        time, cmd_output = runner.run(e.commit, e.test, False)
                 except RunnerError as error:
                     err_code = error.args[0]['err_code']
+                    err_str = error.args[0]['err_str']
                     # We got an error, let's see what we can do about it!
-                    if (err_code.value != RunnerErrorCode.NO_ERROR and
-                        err_code.value < RunnerErrorCode.COMP_DEP_UNK_ERROR.value):
+                    if (err_code == RunnerErrorCode.CMD_TEST_EXEC_TYPE_UNSUPPORTED or
+                        err_code == RunnerErrorCode.CMD_TEST_EXEC_TYPE_NEED_VALID_RESULT_FILE or
+                        err_code == RunnerErrorCode.CMD_RESULT_ALREADY_COMPLETE):
+                        # The resume failed, nothing to do
+                        pass
+                    elif err_code == RunnerErrorCode.REBOOT_NEEDED:
+                        # TODO: have some sort of hooks here to warn the rest of the world
+                        # that we are about to reboot
+                        self._task_list = []
+                        self._task_current = None
+                        self.__log(Criticality.II, "Rebooting...")
+                        runner.reboot()
+                        sys.exit(0)
+                    elif (err_code.value != RunnerErrorCode.NO_ERROR and
+                          err_code.value < RunnerErrorCode.COMP_DEP_UNK_ERROR.value):
                         # Error we cannot do anything about, probably a setup issue
                         # Let's mark the run as aborted until the user resets it!
-                        self.__log(Criticality.EE, "The run returned the error {}".format(err_code))
+                        self.__log(Criticality.EE,  error.args[0]['err_code'])
                         self.set_running_mode(RunningMode.ERROR)
                     elif (err_code == RunnerErrorCode.COMPILATION_FAILED or
-                        err_code == RunnerErrorCode.DEPLOYMENT_FAILED):
+                          err_code == RunnerErrorCode.DEPLOYMENT_FAILED):
                         # Cancel any other test on this commit
                         self._task_list = [x for x in self._task_list if not x.commit == e.commit]
+
                 self._task_lock.acquire()
 
         self._task_current = None
-- 
2.9.3