[PATCH i-g-t 3/4] i915/sysfs_heartbeat_interval: Determine covariance of request vs result

Zbigniew Kempczyński zbigniew.kempczynski at intel.com
Thu Jan 26 10:20:39 UTC 2023


From: Chris Wilson <chris.p.wilson at intel.com>

The crux of using the sysfs interface to set the heartbeat interval is
that the user gets the interval they request. However, system overheads
and systematic measurement errors prevent a simple analysis (for
example, the CPU scheduler may delay the worker by a jiffy, or we may
print so much debug information to the serial console that it takes
over a second to declare a hang). Switch to linear regression analysis
to determine whether the result correlates with the user's request, and
to provide more robust estimates of the overhead and of the number of
pulses before a hang is declared.

Signed-off-by: Chris Wilson <chris.p.wilson at intel.com>
Reviewed-by: Stuart Summers <stuart.summers at intel.com>
Reviewed-by: Andrzej Hajda <andrzej.hajda at intel.com>
---
 lib/igt_stats.c                       | 27 ++++++++++++
 lib/igt_stats.h                       |  9 ++++
 tests/i915/sysfs_heartbeat_interval.c | 61 ++++++++++++++++++---------
 3 files changed, 77 insertions(+), 20 deletions(-)

diff --git a/lib/igt_stats.c b/lib/igt_stats.c
index e0859a0318..f159cb3a05 100644
--- a/lib/igt_stats.c
+++ b/lib/igt_stats.c
@@ -671,3 +671,30 @@ double igt_mean_get_variance(struct igt_mean *m)
 	return m->sq / m->count;
 }
 
+void igt_covariance_add(struct igt_covariance *cv, double x, double y)
+{
+	cv->count++;		/* one more (x, y) sample folded in */
+	cv->sx += x;		/* running sum of x */
+	cv->sy += y;		/* running sum of y */
+	cv->sxx += x * x;	/* running sum of x^2 */
+	cv->sxy += x * y;	/* running sum of x*y */
+	cv->syy += y * y;	/* running sum of y^2 */
+}
+
+/* Least-squares fit y = m*x + b over the samples; returns correlation R. */
+double igt_covariance_get(struct igt_covariance *cv, double *m, double *b)
+{
+	const double n = cv->count;
+	/* Each term is n^2 times the population variance or covariance. */
+	const double var_x = n * cv->sxx - cv->sx * cv->sx;
+	const double var_y = n * cv->syy - cv->sy * cv->sy;
+	const double cov_xy = n * cv->sxy - cv->sx * cv->sy;
+	const double R = cov_xy / sqrt(var_x * var_y);
+
+	*m = cov_xy / var_x;			/* slope */
+	*b = (cv->sy - *m * cv->sx) / n;	/* intercept */
+
+	igt_debug("co-variance correlation:%.3f, intercept:%.2f, slope:%.2f\n",
+		  R, *b, *m);
+	return R;
+}
diff --git a/lib/igt_stats.h b/lib/igt_stats.h
index e8ee5a5871..00d0c37985 100644
--- a/lib/igt_stats.h
+++ b/lib/igt_stats.h
@@ -101,4 +101,13 @@ void igt_mean_add(struct igt_mean *m, double v);
 double igt_mean_get(struct igt_mean *m);
 double igt_mean_get_variance(struct igt_mean *m);
 
+struct igt_covariance {
+	/*< private >*/
+	double sx, sxx, sxy, syy, sy;	/* running sums: x, x*x, x*y, y*y, y */
+	unsigned int count;		/* number of (x, y) samples added */
+};
+
+void igt_covariance_add(struct igt_covariance *cv, double x, double y);
+double igt_covariance_get(struct igt_covariance *cv, double *m, double *b);
+
 #endif /* __IGT_STATS_H__ */
diff --git a/tests/i915/sysfs_heartbeat_interval.c b/tests/i915/sysfs_heartbeat_interval.c
index 8cebf6270f..2bf57ca109 100644
--- a/tests/i915/sysfs_heartbeat_interval.c
+++ b/tests/i915/sysfs_heartbeat_interval.c
@@ -39,6 +39,7 @@
 #include "intel_allocator.h"
 #include "igt_debugfs.h"
 #include "igt_dummyload.h"
+#include "igt_stats.h"
 #include "igt_sysfs.h"
 #include "sw_sync.h"
 
@@ -189,9 +190,30 @@ static uint64_t __test_timeout(int i915, int engine, unsigned int timeout)
 	return elapsed;
 }
 
+/* Fit the samples in @cv to a line and assert on correlation, slope, offset. */
+static void check_heart_rate(struct igt_covariance *cv)
+{
+	double m, b, R;
+
+	R = igt_covariance_get(cv, &m, &b);
+	igt_assert_f(R > 0.9,
+		     "Heartbeat interval not increasing linearly, correlation:%.3f\n", R);
+	/* m: pulses before the hang is declared; b: fixed overhead in ms. */
+	igt_assert_f(m > 0.5,
+		     "At least one heartbeat interval must pass before a hang is detected; measured %.1f\n",
+		     m);
+	igt_assert_f(m < 10, /* Assume 5 misses -> hang */
+		     "A few heartbeats may pass before detection of stall, averaged %.1f pulses\n",
+		     m);
+	igt_assert_f(b < 2 * RESET_TIMEOUT,
+		     "Heartbeat reset/recovery:%.1fms exceeded expectations:%dms\n",
+		     b, RESET_TIMEOUT);
+}
+
 static void test_precise(int i915, int engine)
 {
-	int delays[] = { 1, 50, 100, 500 };
+	const int delays[] = { 1, 50, 100, 200, 300, 400, 500 };
+	struct igt_covariance cv = {};
 	unsigned int saved;
 
 	/*
@@ -215,30 +237,30 @@ static void test_precise(int i915, int engine)
 	gem_quiescent_gpu(i915);
 
 	for (int i = 0; i < ARRAY_SIZE(delays); i++) {
-		uint64_t elapsed;
+		double elapsed;
 
-		elapsed = __test_timeout(i915, engine, delays[i]);
-		igt_info("%s:%d, elapsed=%.3fms[%d]\n", ATTR,
-			 delays[i], elapsed * 1e-6,
-			 (int)(elapsed / 1000 / 1000));
+		elapsed = __test_timeout(i915, engine, delays[i]) * 1e-6;
+		igt_info("%s:%d, elapsed=%.3fms\n", ATTR,
+			 delays[i], elapsed);
 
 		/*
 		 * It takes a couple of missed heartbeats before we start
 		 * terminating hogs, and a little bit of jiffie slack for
-		 * scheduling at each step. 150ms should cover all of our
-		 * sins and be useful tolerance.
+		 * scheduling at each step.
 		 */
-		igt_assert_f(elapsed / 1000 / 1000 < 3 * delays[i] + 150,
-			     "Heartbeat interval (and CPR) exceeded request!\n");
+		igt_covariance_add(&cv, delays[i], elapsed);
 	}
 
 	gem_quiescent_gpu(i915);
 	set_heartbeat(engine, saved);
+
+	check_heart_rate(&cv);
 }
 
 static void test_nopreempt(int i915, int engine)
 {
-	int delays[] = { 1, 50, 100, 500 };
+	const int delays[] = { 1, 50, 100, 200, 300, 400, 500 };
+	struct igt_covariance cv = {};
 	unsigned int saved;
 
 	/*
@@ -258,25 +280,24 @@ static void test_nopreempt(int i915, int engine)
 	gem_quiescent_gpu(i915);
 
 	for (int i = 0; i < ARRAY_SIZE(delays); i++) {
-		uint64_t elapsed;
+		double  elapsed;
 
-		elapsed = __test_timeout(i915, engine, delays[i]);
-		igt_info("%s:%d, elapsed=%.3fms[%d]\n", ATTR,
-			 delays[i], elapsed * 1e-6,
-			 (int)(elapsed / 1000 / 1000));
+		elapsed = __test_timeout(i915, engine, delays[i]) * 1e-6;
+		igt_info("%s:%d, elapsed=%.3fms\n", ATTR,
+			 delays[i], elapsed);
 
 		/*
 		 * It takes a few missed heartbeats before we start
 		 * terminating hogs, and a little bit of jiffie slack for
-		 * scheduling at each step. 500ms should cover all of our
-		 * sins (including debug dumps) and be useful tolerance.
+		 * scheduling at each step.
 		 */
-		igt_assert_f(elapsed / 1000 / 1000 < 5 * delays[i] + 500,
-			     "Heartbeat interval (and CPR) exceeded request!\n");
+		igt_covariance_add(&cv, delays[i], elapsed);
 	}
 
 	gem_quiescent_gpu(i915);
 	set_heartbeat(engine, saved);
+
+	check_heart_rate(&cv);
 }
 
 static unsigned int measured_usleep(unsigned int usec)
-- 
2.34.1



More information about the Intel-gfx-trybot mailing list