[Piglit] [PATCH] Add dmesg option for reboot policy

Tue Nov 24 04:14:23 PST 2015

From: Yann Argotti <yann.argotti at linux.intel.com>
Date: Tue, 24 Nov 2015 12:16:34 +0100

 This adds a policy which advises when user should reboot system to avoid
 noisy test results due to system becoming unstable, for instance, and
 therefore continues testing successfully. To do this, a new Dmesg class is
 proposed which is not filtering dmesg and monitors whether or not one of
 the following event occurs:
  - gpu reset failed (not just gpu reset happened, that happens
 way too often and many tests even provoke hangs intentionally)  - gpu crash,
 - Oops:  - BUG  - lockdep splat that causes the locking validator to get
 disabled If one of these issues happen, piglit test execution is stopped
 -terminating test thread pool- and exit with code 3 to inform that reboot is
 advised. Then test execution resume, after rebooting system or not, is done
 like usually with command line parameter "resume".

 To call it, use command line parameter: --dmesg monitored
 (this is original patch for review and therefore didn't apply any proposed
 change yet)

Signed-off-by: Yann Argotti <yann.argotti at linux.intel.com>
---
 framework/dmesg.py        | 118
+++++++++++++++++++++++++++++++++++++++++++++-
 framework/exceptions.py   |   6 ++-
 framework/profile.py      |  35 ++++++++++++--
 framework/programs/run.py |  45 ++++++++++++++----
 4 files changed, 185 insertions(+), 19 deletions(-)

diff --git a/framework/dmesg.py b/framework/dmesg.py
index daeeeae..66c95f5 100644
--- a/framework/dmesg.py
+++ b/framework/dmesg.py
@@ -46,6 +46,7 @@ import gzip
 __all__ = [
     'BaseDmesg',
     'LinuxDmesg',
+    'LinuxMonitoredDmesg',
     'DummyDmesg',
     'get_dmesg',
 ]
@@ -212,6 +213,116 @@ class LinuxDmesg(BaseDmesg):
         # Attempt to store the last element of dmesg, unless there was no
dmesg
         self._last_message = dmesg[-1] if dmesg else None

+class LinuxMonitoredDmesg(BaseDmesg):
+    """ Read and monitor dmesg on posix systems
+
+    This reads the dmesg ring buffer, stores the contents of the buffer, and
+    then compares it whenever called, returning the difference between the
+    calls. It requires timestamps to be enabled.
+
+    """
+    # We are considering all levels of dmesg to avoid missing
+    # any important ones and with an inaccurate reporting level
+    DMESG_COMMAND = ['dmesg']
+    # With _first_dmesg_process, we are not checking dmesg at test execution
+    # starting time: user must know current machine/GPU state before
+    # starting piglit: we can resume test execution based on last stopping
+    # if we consider that our system is still in good shape (ie not blocking
+    # run forever)
+    _first_dmesg_process = True
+    _reboot_needed = False
+
+    def __init__(self):
+        """ Create a dmesg instance """
+        # _last_message store the contents of the last entry in dmesg the
last
+        # time it was read. Because dmesg is a ringbuffer old entries can
fall
+        # off the end if it becomes too full. To track new entries search
for
+        # this entry and take any entries that were added after it. This
works
+        # on Linux because each entry is guaranteed unique by timestamps.
+        self._last_message = None
+        super(LinuxMonitoredDmesg, self).__init__()
+        self._reboot_needed = False
+
+        if not self._last_message:
+            # We need to ensure that there is something in dmesg to check
+            # timestamps.
+            warnings.warn("Cannot check dmesg for timestamp support. If
you "
+                          "do not have timestamps enabled in your kernel
you "
+                          "get incomplete dmesg captures", RuntimeWarning)
+        elif not re.match(r'\[\s*\d+\.\d+\]', self._last_message):
+            # Do an initial check to ensure that dmesg has timestamps, we
need
+            # timestamps to work correctly. A proper linux dmesg timestamp
+            # looks like this: [    0.00000]
+            raise DmesgError(
+                "Your kernel does not seem to support timestamps, which "
+                "are required by the --dmesg option")
+
+    def reboot_is_needed(self):
+        """ Simply return the reboot_needed variable state """
+        return self._reboot_needed
+
+    def update_dmesg(self):
+        """ Call dmesg using subprocess.check_output
+
+        Get the contents of dmesg, then calculate new messages, finally set
+        self.dmesg equal to the just read contents of dmesg.
+
+        """
+        dmesg =
subprocess.check_output(self.DMESG_COMMAND).strip().splitlines()
+
+        # Find all new entries, do this by slicing the list of dmesg to only
+        # returns elements after the last element stored. If there are not
+        # matches a value error is raised, that means all of dmesg is new
+        l = 0
+        for index, item in enumerate(reversed(dmesg)):
+            if item == self._last_message:
+                l = len(dmesg) - index  # don't include the matched element
+                break
+        self._new_messages = dmesg[l:]
+
+        if (not self._first_dmesg_process):
+            if (any("[drm] Failed to reset chip" in msg for msg in
self._new_messages) or
+                any("[drm] GPU crash" in msg for msg in
self._new_messages) or
+                any("Oops:" in msg for msg in self._new_messages) or
+                any("BUG:" in msg for msg in self._new_messages) or
+                any("turning off the locking correctness validator" in
msg for msg in self._new_messages)):
+               self._reboot_needed = True
+        else:
+           self._first_dmesg_process = False
+
+        # Attempt to store the last element of dmesg, unless there was no
dmesg
+        self._last_message = dmesg[-1] if dmesg else None
+
+    def update_result(self, result):
+        """ Takes a TestResult object and updates it with dmesg statuses
+
+        There is no filtering/change of status in dmesg and return the
+        original result passed in.
+
+        Arguments:
+        result -- A TestResult instance
+
+        """
+
+        # Get a new snapshot of dmesg
+        self.update_dmesg()
+
+        # if update_dmesg() found new entries replace the results of the
test
+        # and subtests
+        if self._new_messages:
+
+            if self.regex:
+                for line in self._new_messages:
+                    if self.regex.search(line):
+                        break
+                else:
+                    return result
+
+            # Add the dmesg values to the result
+            result.dmesg = "\n".join(self._new_messages)
+
+        return result
+

 class DummyDmesg(BaseDmesg):
     """ An dummy class for dmesg on non unix-like systems
@@ -239,7 +350,7 @@ class DmesgError(EnvironmentError):
     pass


-def get_dmesg(not_dummy=True):
+def get_dmesg(not_dummy=True, monitored=False):
     """ Return a Dmesg type instance

     Normally this does a system check, and returns the type that proper for
@@ -248,5 +359,8 @@ def get_dmesg(not_dummy=True):

     """
     if sys.platform.startswith('linux') and not_dummy:
-        return LinuxDmesg()
+        if monitored:
+            return LinuxMonitoredDmesg()
+        else:
+            return LinuxDmesg()
     return DummyDmesg()
diff --git a/framework/exceptions.py b/framework/exceptions.py
index fdd1f1f..c6adfc3 100644
--- a/framework/exceptions.py
+++ b/framework/exceptions.py
@@ -46,8 +46,9 @@ def handler(func):

     @functools.wraps(func)
     def _inner(*args, **kwargs):
-        try:
-            func(*args, **kwargs)
+       returncode = 0
+       try:
+            returncode = func(*args, **kwargs)
         except PiglitFatalError as e:
             print('Fatal Error: {}'.format(str(e)), file=sys.stderr)
             sys.exit(1)
@@ -67,6 +68,7 @@ def handler(func):
             if _DEBUG:
                 raise
             sys.exit(1)
+        return returncode

     return _inner

diff --git a/framework/profile.py b/framework/profile.py
index 32ed759..165d043 100644
--- a/framework/profile.py
+++ b/framework/profile.py
@@ -34,6 +34,8 @@ import importlib
 import contextlib
 import itertools

+from functools import partial
+
 from framework import grouptools, exceptions, options
 from framework.dmesg import get_dmesg
 from framework.log import LogManager
@@ -165,15 +167,25 @@ class TestProfile(object):
         return self._dmesg

     @dmesg.setter
-    def dmesg(self, not_dummy):
+    def dmesg(self, vals):
         """ Set dmesg

-        Arguments:
+        Arguments: (contain in vals)
+
         not_dummy -- if Truthy dmesg will try to get a PosixDmesg, if
Falsy it
                      will get a DummyDmesg
+        monitored -- if True, invoke also that dmesg are monitored to
stop test
+                     execution and advise to reboot (exit code 3)
+
+        if vals is not iterable, it assume this is a signle value

         """
-        self._dmesg = get_dmesg(not_dummy)
+        try:
+             not_dummy, monitored = vals
+        except TypeError, ValueError:
+             not_dummy = vals
+             monitored = False
+        self._dmesg = get_dmesg(not_dummy, monitored)

     def _prepare_test_list(self):
         """ Prepare tests for running
@@ -253,16 +265,19 @@ class TestProfile(object):
         self._prepare_test_list()
         log = LogManager(logger, len(self.test_list))

-        def test(pair):
+        def test(pair, this_pool=None):
             """Function to call test.execute from map"""
             name, test = pair
             with backend.write_test(name) as w:
                 test.execute(name, log.get(), self.dmesg)
                 w(test.result)
+            # in case current dmesg is supporting this property, check it
+            if self.dmesg.reboot_is_needed():
+                this_pool.terminate()

         def run_threads(pool, testlist):
             """ Open a pool, close it, and join it """
-            pool.imap(test, testlist, chunksize)
+            pool.imap(partial(test, this_pool=pool), testlist, chunksize)
             pool.close()
             pool.join()

@@ -289,6 +304,16 @@ class TestProfile(object):

         self._post_run_hook()

+        # return code 3 is to say a reboot is required
+        try:
+            if self.dmesg.reboot_is_needed():
+               return 3
+        except AttributeError:
+            pass
+
+        return 0
+
+
     def filter_tests(self, function):
         """Filter out tests that return false from the supplied function

diff --git a/framework/programs/run.py b/framework/programs/run.py
index 328e0b0..f4f82dd 100644
--- a/framework/programs/run.py
+++ b/framework/programs/run.py
@@ -137,9 +137,17 @@ def _run_parser(input_):
                         action="store_true",
                         help="Run tests in valgrind's memcheck")
     parser.add_argument("--dmesg",
-                        action="store_true",
-                        help="Capture a difference in dmesg before and "
-                             "after each test. Implies -1/--no-concurrency")
+                        dest="dmesg",
+                        action="store",
+                        choices=['none', 'simple', 'monitored'],
+                        default='none',
+                        help="Set level of dmesg usage: "
+                             "none={no dmesg use - default}, "
+                             "simple={Capture a difference in dmesg
before and "
+                             "after each test. Implies
-1/--no-concurrency}, "
+                             "monitored={simple mode + monitor dmesg and
stop "
+                             "test execution when a reboot may be required "
+                             "-exit code 3}")
     parser.add_argument("-s", "--sync",
                         action="store_true",
                         help="Sync results to disk after every test")
@@ -223,7 +231,7 @@ def run(input_):

     # If dmesg is requested we must have serial run, this is becasue dmesg
     # isn't reliable with threaded run
-    if args.dmesg:
+    if args.dmesg != 'none':
         args.concurrency = "none"

     # build up the include filter based on test_list
@@ -268,9 +276,16 @@ def run(input_):

     results.time_elapsed.start = time.time()
     # Set the dmesg type
-    if args.dmesg:
-        profile.dmesg = args.dmesg
-    profile.run(args.log_level, backend)
+    if args.dmesg != 'none':
+        profile.dmesg = (True, args.dmesg=='monitored')
+
+    returncode = profile.run(args.log_level, backend)
+    # in case something went wrong (returncode is not 0)
+    if returncode != 0:
+       print('Aborting Piglit execution: code (', returncode,')')
+       if returncode == 3:
+           print('~> Advising to reboot\n')
+       return returncode

     results.time_elapsed.end = time.time()
     backend.finalize({'time_elapsed': results.time_elapsed})
@@ -278,6 +293,8 @@ def run(input_):
     print('Thank you for running Piglit!\n'
           'Results have been written to ' + args.results_path)

+    return 0
+

 @exceptions.handler
 def resume(input_):
@@ -328,13 +345,21 @@ def resume(input_):

     profile =
framework.profile.merge_test_profiles(results.options['profile'])
     profile.results_dir = args.results_path
-    if options.OPTIONS.dmesg:
-        profile.dmesg = options.OPTIONS.dmesg
+    if options.OPTIONS.dmesg != 'none':
+        profile.dmesg = (True, options.OPTIONS.dmesg=='monitored')

     # This is resumed, don't bother with time since it wont be accurate
anyway
-    profile.run(results.options['log_level'], backend)
+    returncode = profile.run(results.options['log_level'], backend)
+    # in case something went wrong (returncode is not 0)
+    if returncode != 0:
+       print('Aborting Piglit execution: code (', returncode,')')
+       if returncode == 3:
+           print('~> Advising to reboot\n')
+       return returncode

     backend.finalize()

     print("Thank you for running Piglit!\n"
           "Results have been written to {0}".format(args.results_path))
+
+    return 0
--
1.9.1