[Piglit] [PATCH] Add dmesg option for reboot policy
yann.argotti at linux.intel.com
yann.argotti at linux.intel.com
Tue Nov 24 04:14:23 PST 2015
From: Yann Argotti <yann.argotti at linux.intel.com>
Date: Tue, 24 Nov 2015 12:16:34 +0100
This adds a policy which advises when user should reboot system to avoid
noisy test results due to system becoming unstable, for instance, and
therefore continues testing successfully. To do this, a new Dmesg class is
proposed which is not filtering dmesg and monitors whether or not one of
the following event occurs:
- gpu reset failed (not just gpu reset happened, that happens
way too often and many tests even provoke hangs intentionally) - gpu crash,
- Oops: - BUG - lockdep splat that causes the locking validator to get
disabled If one of these issues happen, piglit test execution is stopped
-terminating test thread pool- and exit with code 3 to inform that reboot is
advised. Then test execution resume, after rebooting system or not, is done
like usually with command line parameter "resume".
To call it, use command line parameter: --dmesg monitored
(this is original patch for review and therefore didn't apply any proposed
change yet)
Signed-off-by: Yann Argotti <yann.argotti at linux.intel.com>
---
framework/dmesg.py | 118
+++++++++++++++++++++++++++++++++++++++++++++-
framework/exceptions.py | 6 ++-
framework/profile.py | 35 ++++++++++++--
framework/programs/run.py | 45 ++++++++++++++----
4 files changed, 185 insertions(+), 19 deletions(-)
diff --git a/framework/dmesg.py b/framework/dmesg.py
index daeeeae..66c95f5 100644
--- a/framework/dmesg.py
+++ b/framework/dmesg.py
@@ -46,6 +46,7 @@ import gzip
__all__ = [
'BaseDmesg',
'LinuxDmesg',
+ 'LinuxMonitoredDmesg',
'DummyDmesg',
'get_dmesg',
]
@@ -212,6 +213,116 @@ class LinuxDmesg(BaseDmesg):
# Attempt to store the last element of dmesg, unless there was no
dmesg
self._last_message = dmesg[-1] if dmesg else None
+class LinuxMonitoredDmesg(BaseDmesg):
+ """ Read and monitor dmesg on posix systems
+
+ This reads the dmesg ring buffer, stores the contents of the buffer, and
+ then compares it whenever called, returning the difference between the
+ calls. It requires timestamps to be enabled.
+
+ """
+ # We are considering all levels of dmesg to avoid missing
+ # any important ones and with an inaccurate reporting level
+ DMESG_COMMAND = ['dmesg']
+ # With _first_dmesg_process, we are not checking dmesg at test execution
+ # starting time: user must know current machine/GPU state before
+ # starting piglit: we can resume test execution based on last stopping
+ # if we consider that our system is still in good shape (ie not blocking
+ # run forever)
+ _first_dmesg_process = True
+ _reboot_needed = False
+
+ def __init__(self):
+ """ Create a dmesg instance """
+ # _last_message store the contents of the last entry in dmesg the
last
+ # time it was read. Because dmesg is a ringbuffer old entries can
fall
+ # off the end if it becomes too full. To track new entries search
for
+ # this entry and take any entries that were added after it. This
works
+ # on Linux because each entry is guaranteed unique by timestamps.
+ self._last_message = None
+ super(LinuxMonitoredDmesg, self).__init__()
+ self._reboot_needed = False
+
+ if not self._last_message:
+ # We need to ensure that there is something in dmesg to check
+ # timestamps.
+ warnings.warn("Cannot check dmesg for timestamp support. If
you "
+ "do not have timestamps enabled in your kernel
you "
+ "get incomplete dmesg captures", RuntimeWarning)
+ elif not re.match(r'\[\s*\d+\.\d+\]', self._last_message):
+ # Do an initial check to ensure that dmesg has timestamps, we
need
+ # timestamps to work correctly. A proper linux dmesg timestamp
+ # looks like this: [ 0.00000]
+ raise DmesgError(
+ "Your kernel does not seem to support timestamps, which "
+ "are required by the --dmesg option")
+
+ def reboot_is_needed(self):
+ """ Simply return the reboot_needed variable state """
+ return self._reboot_needed
+
+ def update_dmesg(self):
+ """ Call dmesg using subprocess.check_output
+
+ Get the contents of dmesg, then calculate new messages, finally set
+ self.dmesg equal to the just read contents of dmesg.
+
+ """
+ dmesg =
subprocess.check_output(self.DMESG_COMMAND).strip().splitlines()
+
+ # Find all new entries, do this by slicing the list of dmesg to only
+ # returns elements after the last element stored. If there are not
+ # matches a value error is raised, that means all of dmesg is new
+ l = 0
+ for index, item in enumerate(reversed(dmesg)):
+ if item == self._last_message:
+ l = len(dmesg) - index # don't include the matched element
+ break
+ self._new_messages = dmesg[l:]
+
+ if (not self._first_dmesg_process):
+ if (any("[drm] Failed to reset chip" in msg for msg in
self._new_messages) or
+ any("[drm] GPU crash" in msg for msg in
self._new_messages) or
+ any("Oops:" in msg for msg in self._new_messages) or
+ any("BUG:" in msg for msg in self._new_messages) or
+ any("turning off the locking correctness validator" in
msg for msg in self._new_messages)):
+ self._reboot_needed = True
+ else:
+ self._first_dmesg_process = False
+
+ # Attempt to store the last element of dmesg, unless there was no
dmesg
+ self._last_message = dmesg[-1] if dmesg else None
+
+ def update_result(self, result):
+ """ Takes a TestResult object and updates it with dmesg statuses
+
+ There is no filtering/change of status in dmesg and return the
+ original result passed in.
+
+ Arguments:
+ result -- A TestResult instance
+
+ """
+
+ # Get a new snapshot of dmesg
+ self.update_dmesg()
+
+ # if update_dmesg() found new entries replace the results of the
test
+ # and subtests
+ if self._new_messages:
+
+ if self.regex:
+ for line in self._new_messages:
+ if self.regex.search(line):
+ break
+ else:
+ return result
+
+ # Add the dmesg values to the result
+ result.dmesg = "\n".join(self._new_messages)
+
+ return result
+
class DummyDmesg(BaseDmesg):
""" An dummy class for dmesg on non unix-like systems
@@ -239,7 +350,7 @@ class DmesgError(EnvironmentError):
pass
-def get_dmesg(not_dummy=True):
+def get_dmesg(not_dummy=True, monitored=False):
""" Return a Dmesg type instance
Normally this does a system check, and returns the type that proper for
@@ -248,5 +359,8 @@ def get_dmesg(not_dummy=True):
"""
if sys.platform.startswith('linux') and not_dummy:
- return LinuxDmesg()
+ if monitored:
+ return LinuxMonitoredDmesg()
+ else:
+ return LinuxDmesg()
return DummyDmesg()
diff --git a/framework/exceptions.py b/framework/exceptions.py
index fdd1f1f..c6adfc3 100644
--- a/framework/exceptions.py
+++ b/framework/exceptions.py
@@ -46,8 +46,9 @@ def handler(func):
@functools.wraps(func)
def _inner(*args, **kwargs):
- try:
- func(*args, **kwargs)
+ returncode = 0
+ try:
+ returncode = func(*args, **kwargs)
except PiglitFatalError as e:
print('Fatal Error: {}'.format(str(e)), file=sys.stderr)
sys.exit(1)
@@ -67,6 +68,7 @@ def handler(func):
if _DEBUG:
raise
sys.exit(1)
+ return returncode
return _inner
diff --git a/framework/profile.py b/framework/profile.py
index 32ed759..165d043 100644
--- a/framework/profile.py
+++ b/framework/profile.py
@@ -34,6 +34,8 @@ import importlib
import contextlib
import itertools
+from functools import partial
+
from framework import grouptools, exceptions, options
from framework.dmesg import get_dmesg
from framework.log import LogManager
@@ -165,15 +167,25 @@ class TestProfile(object):
return self._dmesg
@dmesg.setter
- def dmesg(self, not_dummy):
+ def dmesg(self, vals):
""" Set dmesg
- Arguments:
+ Arguments: (contain in vals)
+
not_dummy -- if Truthy dmesg will try to get a PosixDmesg, if
Falsy it
will get a DummyDmesg
+ monitored -- if True, invoke also that dmesg are monitored to
stop test
+ execution and advise to reboot (exit code 3)
+
+ if vals is not iterable, it assume this is a signle value
"""
- self._dmesg = get_dmesg(not_dummy)
+ try:
+ not_dummy, monitored = vals
+ except TypeError, ValueError:
+ not_dummy = vals
+ monitored = False
+ self._dmesg = get_dmesg(not_dummy, monitored)
def _prepare_test_list(self):
""" Prepare tests for running
@@ -253,16 +265,19 @@ class TestProfile(object):
self._prepare_test_list()
log = LogManager(logger, len(self.test_list))
- def test(pair):
+ def test(pair, this_pool=None):
"""Function to call test.execute from map"""
name, test = pair
with backend.write_test(name) as w:
test.execute(name, log.get(), self.dmesg)
w(test.result)
+ # in case current dmesg is supporting this property, check it
+ if self.dmesg.reboot_is_needed():
+ this_pool.terminate()
def run_threads(pool, testlist):
""" Open a pool, close it, and join it """
- pool.imap(test, testlist, chunksize)
+ pool.imap(partial(test, this_pool=pool), testlist, chunksize)
pool.close()
pool.join()
@@ -289,6 +304,16 @@ class TestProfile(object):
self._post_run_hook()
+ # return code 3 is to say a reboot is required
+ try:
+ if self.dmesg.reboot_is_needed():
+ return 3
+ except AttributeError:
+ pass
+
+ return 0
+
+
def filter_tests(self, function):
"""Filter out tests that return false from the supplied function
diff --git a/framework/programs/run.py b/framework/programs/run.py
index 328e0b0..f4f82dd 100644
--- a/framework/programs/run.py
+++ b/framework/programs/run.py
@@ -137,9 +137,17 @@ def _run_parser(input_):
action="store_true",
help="Run tests in valgrind's memcheck")
parser.add_argument("--dmesg",
- action="store_true",
- help="Capture a difference in dmesg before and "
- "after each test. Implies -1/--no-concurrency")
+ dest="dmesg",
+ action="store",
+ choices=['none', 'simple', 'monitored'],
+ default='none',
+ help="Set level of dmesg usage: "
+ "none={no dmesg use - default}, "
+ "simple={Capture a difference in dmesg
before and "
+ "after each test. Implies
-1/--no-concurrency}, "
+ "monitored={simple mode + monitor dmesg and
stop "
+ "test execution when a reboot may be required "
+ "-exit code 3}")
parser.add_argument("-s", "--sync",
action="store_true",
help="Sync results to disk after every test")
@@ -223,7 +231,7 @@ def run(input_):
# If dmesg is requested we must have serial run, this is becasue dmesg
# isn't reliable with threaded run
- if args.dmesg:
+ if args.dmesg != 'none':
args.concurrency = "none"
# build up the include filter based on test_list
@@ -268,9 +276,16 @@ def run(input_):
results.time_elapsed.start = time.time()
# Set the dmesg type
- if args.dmesg:
- profile.dmesg = args.dmesg
- profile.run(args.log_level, backend)
+ if args.dmesg != 'none':
+ profile.dmesg = (True, args.dmesg=='monitored')
+
+ returncode = profile.run(args.log_level, backend)
+ # in case something went wrong (returncode is not 0)
+ if returncode != 0:
+ print('Aborting Piglit execution: code (', returncode,')')
+ if returncode == 3:
+ print('~> Advising to reboot\n')
+ return returncode
results.time_elapsed.end = time.time()
backend.finalize({'time_elapsed': results.time_elapsed})
@@ -278,6 +293,8 @@ def run(input_):
print('Thank you for running Piglit!\n'
'Results have been written to ' + args.results_path)
+ return 0
+
@exceptions.handler
def resume(input_):
@@ -328,13 +345,21 @@ def resume(input_):
profile =
framework.profile.merge_test_profiles(results.options['profile'])
profile.results_dir = args.results_path
- if options.OPTIONS.dmesg:
- profile.dmesg = options.OPTIONS.dmesg
+ if options.OPTIONS.dmesg != 'none':
+ profile.dmesg = (True, options.OPTIONS.dmesg=='monitored')
# This is resumed, don't bother with time since it wont be accurate
anyway
- profile.run(results.options['log_level'], backend)
+ returncode = profile.run(results.options['log_level'], backend)
+ # in case something went wrong (returncode is not 0)
+ if returncode != 0:
+ print('Aborting Piglit execution: code (', returncode,')')
+ if returncode == 3:
+ print('~> Advising to reboot\n')
+ return returncode
backend.finalize()
print("Thank you for running Piglit!\n"
"Results have been written to {0}".format(args.results_path))
+
+ return 0
--
1.9.1
More information about the Piglit
mailing list