[Piglit] [PATCH][RFC] Add aborting option when a monitored error is detected

Thu Mar 17 17:44:04 UTC 2016

This adds a policy which advises when the user should reboot the system
to avoid noisy test results due to system becoming unstable, for
instance, and therefore continues testing successfully.

To do this, a new module is proposed. A class Monitoring is used for
managing the monitoring rules. Two types of rules, MonitoringFile and
MonitoringLinuxDmesg, derived from the abstract class MonitoringBase,
have been implemented. The first allow to track a pattern on standard
files or device files. The second, derived from dmesg.LinuxDmesg, will
track a pattern on the dmesg.

The monitoring rules must be defined in piglit.conf at the section
monitored-errors. If one of the regex is found, Piglit will store the
cause in the TestResult object, raise a PiglitAbortException exception,
stop the test execution -terminating test thread pool- and exit with
code 3.

Then test execution resume, after rebooting the system or not, is done
like usually with command line parameter "resume".

To call it, use command line parameter: --abort-on-monitored-error

This option implies --no-concurrent
---
 framework/exceptions.py   |  13 +++
 framework/monitoring.py   | 266 ++++++++++++++++++++++++++++++++++++++++++++++
 framework/options.py      |   2 +
 framework/profile.py      |  32 +++++-
 framework/programs/run.py |  18 +++-
 framework/results.py      |   9 +-
 framework/test/base.py    |   5 +-
 piglit.conf.example       |  24 +++++
 8 files changed, 362 insertions(+), 7 deletions(-)
 create mode 100644 framework/monitoring.py

diff --git a/framework/exceptions.py b/framework/exceptions.py
index 6e87bee..8e8e9a0 100644
--- a/framework/exceptions.py
+++ b/framework/exceptions.py
@@ -30,6 +30,7 @@ __all__ = [
     'PiglitInternalError',
     'PiglitFatalError',
     'PiglitException',
+    'PiglitAbortException',
     'handler',
 ]
 
@@ -51,6 +52,9 @@ def handler(func):
         except PiglitFatalError as e:
             print('Fatal Error: {}'.format(str(e)), file=sys.stderr)
             sys.exit(1)
+        except PiglitAbortException as e:
+            print('Aborting Piglit execution: {}'.format(str(e)), file=sys.stderr)
+            sys.exit(3)
 
     return _inner
 
@@ -87,3 +91,12 @@ class PiglitFatalError(Exception):
     to the top of the program where it exits.
 
     """
+
+
+class PiglitAbortException(Exception):
+    """Class for non-errors that require piglit aborting.
+
+    When this class (or a subclass) is raised it should be raised all the way
+    to the top of the program where it exits.
+
+    """
diff --git a/framework/monitoring.py b/framework/monitoring.py
new file mode 100644
index 0000000..65f1cc9
--- /dev/null
+++ b/framework/monitoring.py
@@ -0,0 +1,266 @@
+# Copyright (c) 2016 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+""" Module implementing classes for monitoring
+
+This module provides a set of classes for monitoring the system on multiple
+sources. The sources can be standard files, device files or the dmesg.
+The patterns of the errors are defined in piglit.conf, at the section
+monitored-errors.
+
+When one of the regex is found in the corresponding source, the message is
+added to TestResult and abort Piglit with code 3.
+
+"""
+
+import abc
+import re
+import sys
+import ast
+import os
+import fcntl
+import errno
+
+import six
+
+from framework import exceptions, core
+from framework.dmesg import LinuxDmesg
+
+__all__ = [
+    'LinuxMonitoringDmesg',
+    'LinuxMonitoringFile',
+    'BaseMonitoring',
+    'Monitoring',
+]
+
+
+
+class Monitoring():
+    """ Class that initialize and update monitoring classes
+
+    This reads the rules from the section monitored-errors in piglit.conf
+    and initializes the monitoring objects. Their type must be derived from
+    BaseMonitoring and specialized according the field 'type'.
+
+    """
+
+    # starting time: user must know current machine/GPU state before
+    # starting piglit: we can resume test execution based on last stopping
+    # if we consider that our system is still in good shape (ie not blocking
+    # run forever)
+    _abort_error = None
+    _monitoring_rules = None
+
+    def __init__(self, monitoring_enabled):
+        """ Create a LinuxMonitored instance """
+        # Get the monitoring rules from piglit.conf and store them into a dict.
+        self._monitoring_rules = {}
+        if monitoring_enabled:
+            for (key, value) in core.PIGLIT_CONFIG.items('monitored-errors'):
+                params = ast.literal_eval(value)
+                rule = None
+                if params['type'] == 'file':
+                    rule = MonitoringFile(params['parameters'],
+                                          params['regex'])
+                elif params['type'] == 'device':
+                    rule = MonitoringFile(params['parameters'],
+                                          params['regex'], True)
+                elif params['type'] == 'dmesg':
+                    rule = MonitoringLinuxDmesg(params['parameters'],
+                                                params['regex'])
+                self._monitoring_rules[key] = rule
+
+    @property
+    def abort_needed(self):
+        """ simply return if _abort_error variable is not empty
+
+        """
+        return self._abort_error is not None
+
+    def update_monitoring(self):
+        """  Update the new messages for each monitoring object
+
+        """
+        if self._monitoring_rules:
+            for (key, monitoring_rule) in self._monitoring_rules.items():
+                monitoring_rule.update_monitoring()
+
+    def update_result(self, result):
+        """ Check monitoring objects statue
+
+        This method checks the state for each monitoring object.
+        If one of them found the pattern in the new messages,
+        set itself on abort_needed state and store the error message
+        in the result.
+
+        Arguments:
+        result -- A TestResult instance
+
+        """
+        # Get a new snapshot of the source
+        self.update_monitoring()
+
+        if self._monitoring_rules:
+            for (key, monitoring_rule) in self._monitoring_rules.items():
+                # Get error message
+                self._abort_error = monitoring_rule.check_monitoring()
+                # if error message is not empty, abort is requested
+                if self.abort_needed:
+                    # Save from which monitoring rule the error come
+                    result.abort_error = "\n".join(
+                        monitoring_rule._new_messages)
+                    result.abort_error_source = key
+                    break
+
+
+        return result
+
+ at six.add_metaclass(abc.ABCMeta)
+class BaseMonitoring(object):
+    """ Abstract base class for Monitoring derived objects
+
+    This provides the bases of the constructor and most subclasses should call
+    super() in __init__(). It provides a conctret implementation of the
+    check_monitoring() method, which should be suitible for all subclasses.
+
+    The update_monitoring() method need to be override for all subclasses.
+
+    """
+    @abc.abstractmethod
+    def __init__(self, monitoring_source, regex):
+        """ Abstract constructor for BaseMonitoring subclasses
+
+        Arguments;
+        monitoring_source -- The source to monitor
+        regex -- The errors pattern
+
+        """
+        self._monitoring_source = monitoring_source
+        self._monitoring_regex = re.compile(regex)
+
+    @abc.abstractmethod
+    def update_monitoring(self):
+        """ Update _new_messages list
+
+        This method should read a monitoring source like a file or a program
+        output, determine new entries since the last read of the source,
+        and update self._new_messages with those entries
+
+        """
+        pass
+
+    def check_monitoring(self):
+        """ Check _new_messages
+
+        This method checks if the regex is found in the new messages,
+        then return a string that contain the monitored location and
+        the matched line
+
+        """
+        if self._new_messages and self._monitoring_regex:
+            for line in self._new_messages:
+                if self._monitoring_regex.search(line):
+                    return "From {}:\n{}".format(self._monitoring_source,line)
+
+
+class MonitoringFile(BaseMonitoring):
+    """ Monitoring from a file
+
+    This class is for monitoring the system from a file that
+    can be a standard file or a device file
+
+    Arguments:
+    isDevice -- True if the target is a device file
+
+    """
+    def __init__(self, monitoring_source, regex, isDevice=False):
+        """ Create a MonitoringFile instance """
+        self._last_message = None
+        self._isDevice = isDevice
+        super(MonitoringFile, self).__init__(monitoring_source, regex)
+
+    def update_monitoring(self):
+        """ Open the file and get the differences
+
+        Get the contents of the file, then calculate new messages.
+        This implements also a specific method for reading device files.
+
+        """
+        try:
+            with open(self._monitoring_source, 'r') as f:
+                lines = []
+                if self._isDevice:
+                    # Create a duplicated file descriptor, this avoid lock
+                    fd = os.dup(f.fileno())
+                    f.close()
+                    # use I/O control for reading the lines
+                    fcntl.fcntl(fd, fcntl.F_SETFL, os.O_NONBLOCK)
+                    while True:
+                        try:
+                            line = os.read(fd, 1024)
+                        except OSError as e:
+                            if e.errno == errno.EAGAIN:
+                                break
+                            else:
+                                raise e
+
+                        lines.append(line.decode("utf-8", "strict"))
+                    os.close(fd)
+
+                else:
+                    lines = f.read().splitlines()
+
+                # Find all new entries, do this by slicing the list of the lines to only
+                # returns elements after the last element stored. If there are not
+                # matches a value error is raised, that means all of the lines are new
+                l = 0
+                for index, item in enumerate(reversed(lines)):
+                    if item == self._last_message:
+                        l = len(lines) - index  # don't include the matched element
+                        break
+                self._new_messages = lines[l:]
+                # Attempt to store the last element of lines,
+                # unless there was no line
+                self._last_message = lines[-1] if lines else None
+        except Exception:
+            # if an error occured, we consider there are no new messages
+            self._new_messages = []
+            pass
+
+
+class MonitoringLinuxDmesg(BaseMonitoring, LinuxDmesg):
+    """ Monitoring on dmesg
+
+    This class is for monitoring on the system dmesg. It's inherited
+    from LinuxDmesg for the dmesg processing methods.
+
+    Work only on Linux operating system.
+
+    """
+    def __init__(self, monitoring_source, regex):
+        """ Create a MonitoringLinuxDmesg instance """
+        self.DMESG_COMMAND = monitoring_source.split()
+        BaseMonitoring.__init__(self, monitoring_source, regex)
+        LinuxDmesg.__init__(self)
+
+    def update_monitoring(self):
+        """ Call update_dmesg """
+        self.update_dmesg()
diff --git a/framework/options.py b/framework/options.py
index cf49520..f6fb141 100644
--- a/framework/options.py
+++ b/framework/options.py
@@ -178,6 +178,7 @@ class _Options(object):  # pylint: disable=too-many-instance-attributes
     exclude_filter -- list of compiled regex which exclude tests that match
     valgrind -- True if valgrind is to be used
     dmesg -- True if dmesg checking is desired. This forces concurrency off
+    monitored -- True if monitoring is desired. This forces concurrency off
     env -- environment variables set for each test before run
 
     """
@@ -193,6 +194,7 @@ class _Options(object):  # pylint: disable=too-many-instance-attributes
         self.valgrind = False
         self.dmesg = False
         self.sync = False
+        self.monitored = False
 
         # env is used to set some base environment variables that are not going
         # to change across runs, without sending them to os.environ which is
diff --git a/framework/profile.py b/framework/profile.py
index fc38c56..d385db6 100644
--- a/framework/profile.py
+++ b/framework/profile.py
@@ -35,6 +35,7 @@ import multiprocessing.dummy
 import importlib
 import contextlib
 import itertools
+from functools import partial
 
 import six
 
@@ -42,6 +43,7 @@ from framework import grouptools, exceptions, options
 from framework.dmesg import get_dmesg
 from framework.log import LogManager
 from framework.test.base import Test
+from framework.monitoring import Monitoring
 
 __all__ = [
     'TestProfile',
@@ -162,6 +164,8 @@ class TestProfile(object):
         self._dmesg = None
         self.dmesg = False
         self.results_dir = None
+        self._monitoring = None
+        self.monitoring = False
 
     @property
     def dmesg(self):
@@ -179,6 +183,22 @@ class TestProfile(object):
         """
         self._dmesg = get_dmesg(not_dummy)
 
+    @property
+    def monitoring(self):
+        """ Return monitoring """
+        return self._monitoring
+
+    @monitoring.setter
+    def monitoring(self, monitored):
+        """ Set monitoring
+
+        Arguments:
+        monitored -- if Truthy Monioring will enable monitoring according the
+                     defined rules
+
+        """
+        self._monitoring = Monitoring(monitored)
+
     def _prepare_test_list(self):
         """ Prepare tests for running
 
@@ -257,16 +277,19 @@ class TestProfile(object):
         self._prepare_test_list()
         log = LogManager(logger, len(self.test_list))
 
-        def test(pair):
+        def test(pair, this_pool=None):
             """Function to call test.execute from map"""
             name, test = pair
             with backend.write_test(name) as w:
-                test.execute(name, log.get(), self.dmesg)
+                test.execute(name, log.get(), self.dmesg, self.monitoring)
                 w(test.result)
+            # in case current dmesg is supporting this property, check it
+            if self._monitoring.abort_needed:
+                this_pool.terminate()
 
         def run_threads(pool, testlist):
             """ Open a pool, close it, and join it """
-            pool.imap(test, testlist, chunksize)
+            pool.imap(partial(test, this_pool=pool), testlist, chunksize)
             pool.close()
             pool.join()
 
@@ -293,6 +316,9 @@ class TestProfile(object):
 
         self._post_run_hook()
 
+        if self._monitoring.abort_needed:
+            raise exceptions.PiglitAbortException(self._monitoring._abort_error)
+
     def filter_tests(self, function):
         """Filter out tests that return false from the supplied function
 
diff --git a/framework/programs/run.py b/framework/programs/run.py
index 49f7b11..2920a7f 100644
--- a/framework/programs/run.py
+++ b/framework/programs/run.py
@@ -136,6 +136,13 @@ def _run_parser(input_):
                         action="store_true",
                         help="Capture a difference in dmesg before and "
                              "after each test. Implies -1/--no-concurrency")
+    parser.add_argument("--abort-on-monitored-error",
+                        action="store_true",
+                        dest="monitored",
+                        help="Enable monitoring according the rules defined "
+                        "in piglit.conf, and stop the execution when a"
+                        "monitored error is detected. Exit code 3."
+                        " Implies -1/--no-concurrency")
     parser.add_argument("-s", "--sync",
                         action="store_true",
                         help="Sync results to disk after every test")
@@ -219,7 +226,7 @@ def run(input_):
 
     # If dmesg is requested we must have serial run, this is becasue dmesg
     # isn't reliable with threaded run
-    if args.dmesg:
+    if args.dmesg or args.monitored:
         args.concurrency = "none"
 
     # build up the include filter based on test_list
@@ -236,6 +243,7 @@ def run(input_):
     options.OPTIONS.valgrind = args.valgrind
     options.OPTIONS.dmesg = args.dmesg
     options.OPTIONS.sync = args.sync
+    options.OPTIONS.monitored = args.monitored
 
     # Set the platform to pass to waffle
     options.OPTIONS.env['PIGLIT_PLATFORM'] = args.platform
@@ -266,6 +274,10 @@ def run(input_):
     # Set the dmesg type
     if args.dmesg:
         profile.dmesg = args.dmesg
+
+    if args.monitored:
+        profile.monitoring = args.monitored
+
     profile.run(args.log_level, backend)
 
     results.time_elapsed.end = time.time()
@@ -302,6 +314,7 @@ def resume(input_):
     options.OPTIONS.valgrind = results.options['valgrind']
     options.OPTIONS.dmesg = results.options['dmesg']
     options.OPTIONS.sync = results.options['sync']
+    options.OPTIONS.monitored = results.options['monitored']
 
     core.get_config(args.config_file)
 
@@ -327,6 +340,9 @@ def resume(input_):
     if options.OPTIONS.dmesg:
         profile.dmesg = options.OPTIONS.dmesg
 
+    if options.OPTIONS.monitored:
+        profile.monitoring = options.OPTIONS.monitored
+
     # This is resumed, don't bother with time since it wont be accurate anyway
     profile.run(results.options['log_level'], backend)
 
diff --git a/framework/results.py b/framework/results.py
index 3bc5363..f677f1e 100644
--- a/framework/results.py
+++ b/framework/results.py
@@ -148,7 +148,7 @@ class TestResult(object):
     """An object represting the result of a single test."""
     __slots__ = ['returncode', '_err', '_out', 'time', 'command', 'traceback',
                  'environment', 'subtests', 'dmesg', '__result', 'images',
-                 'exception', 'pid']
+                 'exception', 'pid', 'abort_error_source', 'abort_error']
     err = StringDescriptor('_err')
     out = StringDescriptor('_out')
 
@@ -163,6 +163,8 @@ class TestResult(object):
         self.traceback = None
         self.exception = None
         self.pid = None
+        self.abort_error_source = str()
+        self.abort_error = str()
         if result:
             self.result = result
         else:
@@ -203,6 +205,8 @@ class TestResult(object):
             'traceback': self.traceback,
             'dmesg': self.dmesg,
             'pid': self.pid,
+            'abort_error_source': self.abort_error_source,
+            'abort_error': self.abort_error,
         }
         return obj
 
@@ -222,7 +226,8 @@ class TestResult(object):
         inst = cls()
 
         for each in ['returncode', 'command', 'exception', 'environment',
-                     'time', 'traceback', 'result', 'dmesg', 'pid']:
+                     'time', 'traceback', 'result', 'dmesg', 'pid',
+                     'abort_error_source', 'abort_error']:
             if each in dict_:
                 setattr(inst, each, dict_[each])
 
diff --git a/framework/test/base.py b/framework/test/base.py
index 0fe0f74..07cad12 100644
--- a/framework/test/base.py
+++ b/framework/test/base.py
@@ -183,7 +183,7 @@ class Test(object):
             assert isinstance(timeout, int)
             self.timeout = timeout
 
-    def execute(self, path, log, dmesg):
+    def execute(self, path, log, dmesg, monitoring):
         """ Run a test
 
         Run a test, but with features. This times the test, uses dmesg checking
@@ -193,6 +193,7 @@ class Test(object):
         path -- the name of the test
         log -- a log.Log instance
         dmesg -- a dmesg.BaseDmesg derived class
+        monitoring -- a monitoring.Monitoring instance
 
         """
         log.start(path)
@@ -201,9 +202,11 @@ class Test(object):
             try:
                 self.result.time.start = time.time()
                 dmesg.update_dmesg()
+                monitoring.update_monitoring()
                 self.run()
                 self.result.time.end = time.time()
                 self.result = dmesg.update_result(self.result)
+                self.result = monitoring.update_result(self.result)
             # This is a rare case where a bare exception is okay, since we're
             # using it to log exceptions
             except:
diff --git a/piglit.conf.example b/piglit.conf.example
index 0410a17..a1e6571 100644
--- a/piglit.conf.example
+++ b/piglit.conf.example
@@ -138,3 +138,27 @@ run_test=./%(test_name)s
 [expected-crashes]
 ; Like expected-failures, but specifies that a test is expected to
 ; crash.
+
+[monitored-errors]
+; Set the monitoring rules for checking when the system need to be rebooted.
+; Piglit must be launched with --abort-on-monitored-error
+;
+; Each monitoring rule must define the type of monitoring (dmesg, file or device).
+; Depending on the type, the parameter 'parameters' is a filename or a list of options.
+; The regex is the pattern that causes Piglit aborting when it's found. Examples :
+;
+;i915_error_state={
+;      'type':'file',
+;      'parameters':'/sys/kernel/debug/dri/0/i915_error_state',
+;      'regex':'^((?!no error state collected).)*$'
+;     }
+;kmsg={
+;      'type':'device',
+;      'parameters':'/dev/kmsg',
+;      'regex':'\*ERROR\* ring create req|\*ERROR\* Failed to reset chip'
+;     }
+;dmesg_error={
+;      'type':'dmesg',
+;      'parameters':'dmesg --level emerg,alert,crit,err,warn,notice',
+;      'regex':'''\*ERROR\* ring create req|\*ERROR\* Failed to reset chip|BUG:|Oops:|turning off the locking correctness validator'''
+;    }
-- 
2.7.0