[Ezbench-dev] [PATCH 14/15] smartezbench: resume incomplete runs, if possible

Mon Jan 30 20:54:12 UTC 2017

---
 python-modules/ezbench/smartezbench.py | 71 +++++++++++++++++++++++++---------
 1 file changed, 52 insertions(+), 19 deletions(-)

diff --git a/python-modules/ezbench/smartezbench.py b/python-modules/ezbench/smartezbench.py
index 5f3474b..0355a07 100644
--- a/python-modules/ezbench/smartezbench.py
+++ b/python-modules/ezbench/smartezbench.py
@@ -653,6 +653,7 @@ class SmartEzbench:
         exit_code = 1
         task_tree = list()
         events_str = []
+        resumable_tasks = []
 
         # Make sure we catch *any* error, because we need to send stuff in the
         # Queue if we do not want the parent process to get stuck
@@ -678,8 +679,6 @@ class SmartEzbench:
                     for key in result.results():
                         full_name = Test.partial_name(result.test.full_name, [key])
                         SmartEzbench.__remove_task_from_tasktree__(task_tree, commit.full_sha1, full_name, len(result.result(key)))
-                        # HACK: Remove this when all the new reports use the full_sha1 for storage
-                        SmartEzbench.__remove_task_from_tasktree__(task_tree, commit.sha1, full_name, len(result.result(key)))
 
             # Delete the tests on commits that do not compile
             for commit in report.commits:
@@ -690,13 +689,21 @@ class SmartEzbench:
                         del task_tree[commit.sha1]
 
             exit_code = 0
+            resumable_tasks = report.journal.incomplete_tests()
         except Exception as e:
             traceback.print_exc(file=sys.stderr)
             sys.stderr.write("\n")
             pass
 
         # Return the result
-        q.put((exit_code, task_tree, events_str))
+        q.put((exit_code, task_tree, events_str, resumable_tasks))
+
+    def pop_from_resumable_tasks(self, resumable_tasks, version, test):
+        # Drop the first entry matching (version, test); return its "result_file" (None if unset or no match)
+        for idx, entry in enumerate(resumable_tasks):
+            if entry["version"] == version and entry["test"] == test:
+                return resumable_tasks.pop(idx).get("result_file")
+        return None
 
     def run(self):
         self.__log(Criticality.II, "----------------------")
@@ -731,7 +738,7 @@ class SmartEzbench:
         p = multiprocessing.Process(target=SmartEzbench.__generate_task_and_events_list__,
                                     args=(q, self.state, self.log_folder, self.repo()))
         p.start()
-        exit_code, task_tree, self._events_str = q.get()
+        exit_code, task_tree, self._events_str, resumable_tasks = q.get()
         p.join()
 
         if len(task_tree) == 0:
@@ -740,6 +747,7 @@ class SmartEzbench:
 
         task_tree_str = pprint.pformat(task_tree)
         self.__log(Criticality.II, "Task list: {tsk_str}".format(tsk_str=task_tree_str))
+        self.__log(Criticality.II, "Incomplete runs: {}".format(resumable_tasks))
 
         # Lock the report for further changes (like for profiles)
         self.__write_attribute__('beenRunBefore', True)
@@ -784,21 +792,46 @@ class SmartEzbench:
             self._task_current.started()
             for r in range(0, e.rounds):
                 self._task_lock.release()
-                try:
-                    time, cmd_output = runner.run(e.commit, e.test, False)
-                except RunnerError as error:
-                    err_code = error.args[0]['err_code']
-                    # We got an error, let's see what we can do about it!
-                    if (err_code.value != RunnerErrorCode.NO_ERROR and
-                        err_code.value < RunnerErrorCode.COMP_DEP_UNK_ERROR.value):
-                        # Error we cannot do anything about, probably a setup issue
-                        # Let's mark the run as aborted until the user resets it!
-                        self.__log(Criticality.EE, "The run returned the error {}".format(err_code))
-                        self.set_running_mode(RunningMode.ERROR)
-                    elif (err_code == RunnerErrorCode.COMPILATION_FAILED or
-                        err_code == RunnerErrorCode.DEPLOYMENT_FAILED):
-                        # Cancel any other test on this commit
-                        self._task_list = [x for x in self._task_list if not x.commit == e.commit]
+
+                # Try to resume tasks before trying to add a new run
+                while True:
+                    try:
+                        result_file = self.pop_from_resumable_tasks(resumable_tasks, e.commit, e.test)
+                        if result_file is not None:
+                            time, cmd_output = runner.resume(e.commit, e.test, result_file, False)
+                        else:
+                            time, cmd_output = runner.run(e.commit, e.test, False)
+                    except RunnerError as error:
+                        err_code = error.args[0]['err_code']
+                        err_str = error.args[0]['err_str']
+                        # We got an error, let's see what we can do about it!
+                        if (err_code == RunnerErrorCode.CMD_TEST_EXEC_TYPE_UNSUPPORTED or
+                            err_code == RunnerErrorCode.CMD_TEST_EXEC_TYPE_NEED_VALID_RESULT_FILE or
+                            err_code == RunnerErrorCode.CMD_RESULT_ALREADY_COMPLETE):
+                            # The resume failed, try all the other ones until we try a normal run
+                            continue
+                        elif err_code == RunnerErrorCode.REBOOT_NEEDED:
+                            # TODO: have some sort of hooks here to warn the rest of the world
+                            # that we are about to reboot
+                            self._task_list = []
+                            self._task_current = None
+                            self.__log(Criticality.II, "Rebooting...")
+                            runner.reboot()
+                            sys.exit(0)
+                        elif (err_code.value != RunnerErrorCode.NO_ERROR and
+                            err_code.value < RunnerErrorCode.COMP_DEP_UNK_ERROR.value):
+                            # Error we cannot do anything about, probably a setup issue
+                            # Let's mark the run as aborted until the user resets it!
+                            self.__log(Criticality.EE,  error.args[0]['err_code'])
+                            self.set_running_mode(RunningMode.ERROR)
+                        elif (err_code == RunnerErrorCode.COMPILATION_FAILED or
+                            err_code == RunnerErrorCode.DEPLOYMENT_FAILED):
+                            # Cancel any other test on this commit
+                            self._task_list = [x for x in self._task_list if not x.commit == e.commit]
+
+                    # Loop only if a resume failed
+                    break
+
                 self._task_lock.acquire()
 
         self._task_current = None
-- 
2.11.0