[PATCH v4 2/3] drm/xe: Sample gpu timestamp closer to exec queues
Lucas De Marchi
lucas.demarchi at intel.com
Fri Nov 8 05:33:17 UTC 2024
Move the force_wake_get to the beginning of the function so the gpu
timestamp is sampled closer to the exec queue timestamps. This avoids
additional delays waiting for the force wake ack, which can make the
cycles/total_cycles proportion fluctuate around the real value.
For a gputop-like application getting 2 samples to calculate the
utilization:
	sample 0:
		read_exec_queue_timestamp
						<<<< (A)
		read_gpu_timestamp
	sample 1:
		read_exec_queue_timestamp
						<<<<< (B)
		read_gpu_timestamp
In the above case, utilization can be bigger or smaller than it should
be, depending on whether the additional delay lands in (A) or (B),
respectively.
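
For illustration only (not part of this patch), a minimal sketch of the
math such a tool performs, assuming it has already parsed the per-client
cycles and the gpu timestamp (total_cycles) from two fdinfo reads; the
struct and function names below are made up:

	struct fdinfo_sample {
		unsigned long long cycles;		/* sum of exec queue run ticks */
		unsigned long long total_cycles;	/* gpu timestamp */
	};

	/* utilization over the window between two samples: busy / elapsed */
	static double utilization(const struct fdinfo_sample *s0,
				  const struct fdinfo_sample *s1)
	{
		unsigned long long busy = s1->cycles - s0->cycles;
		unsigned long long total = s1->total_cycles - s0->total_cycles;

		/*
		 * Extra delay at (A) makes s0->total_cycles later, shrinking
		 * the denominator (utilization too big); extra delay at (B)
		 * makes s1->total_cycles later, growing it (utilization too
		 * small).
		 */
		return total ? (double)busy / (double)total : 0.0;
	}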
With this change, an LNL system that was failing
`xe_drm_fdinfo --r utilization-single-full-load` after ~60 iterations
gets to run 100 iterations without a failure. This is still not perfect,
and it's easy to introduce errors by just loading the CPU with
`stress --cpu $(nproc)`: in that case the same igt test fails after 2 or
3 iterations. That will be dealt with in the test itself, by using a
longer sampling period.
v2: Rename function and add another one to get "any engine", preparing
for caching the hwe in the future (Umesh / Jonathan)
Signed-off-by: Lucas De Marchi <lucas.demarchi at intel.com>
---
drivers/gpu/drm/xe/xe_drm_client.c | 73 ++++++++++++++++++++----------
1 file changed, 49 insertions(+), 24 deletions(-)
diff --git a/drivers/gpu/drm/xe/xe_drm_client.c b/drivers/gpu/drm/xe/xe_drm_client.c
index dd4e16a84874c..298a587da7f17 100644
--- a/drivers/gpu/drm/xe/xe_drm_client.c
+++ b/drivers/gpu/drm/xe/xe_drm_client.c
@@ -269,6 +269,49 @@ static void show_meminfo(struct drm_printer *p, struct drm_file *file)
 	}
 }
+static struct xe_hw_engine *any_engine(struct xe_device *xe)
+{
+	struct xe_gt *gt;
+	unsigned long gt_id;
+
+	for_each_gt(gt, xe, gt_id) {
+		struct xe_hw_engine *hwe = xe_gt_any_hw_engine(gt);
+
+		if (hwe)
+			return hwe;
+	}
+
+	return NULL;
+}
+
+static bool force_wake_get_any_engine(struct xe_device *xe,
+				      struct xe_hw_engine **phwe,
+				      unsigned int *pfw_ref)
+{
+	enum xe_force_wake_domains domain;
+	unsigned int fw_ref;
+	struct xe_hw_engine *hwe;
+	struct xe_force_wake *fw;
+
+	hwe = any_engine(xe);
+	if (!hwe)
+		return false;
+
+	domain = xe_hw_engine_to_fw_domain(hwe);
+	fw = gt_to_fw(hwe->gt);
+
+	fw_ref = xe_force_wake_get(fw, domain);
+	if (!xe_force_wake_ref_has_domain(fw_ref, domain)) {
+		xe_force_wake_put(fw, fw_ref);
+		return false;
+	}
+
+	*phwe = hwe;
+	*pfw_ref = fw_ref;
+
+	return true;
+}
+
 static void show_run_ticks(struct drm_printer *p, struct drm_file *file)
 {
 	unsigned long class, i, gt_id, capacity[XE_ENGINE_CLASS_MAX] = { };
@@ -288,6 +331,10 @@ static void show_run_ticks(struct drm_printer *p, struct drm_file *file)
 		       !atomic_read(&xef->exec_queue.pending_removal));
 	xe_pm_runtime_get(xe);
+	if (!force_wake_get_any_engine(xe, &hwe, &fw_ref)) {
+		xe_pm_runtime_put(xe);
+		return;
+	}
 	/* Accumulate all the exec queues from this client */
 	mutex_lock(&xef->exec_queue.lock);
@@ -302,33 +349,11 @@ static void show_run_ticks(struct drm_printer *p, struct drm_file *file)
 	}
 	mutex_unlock(&xef->exec_queue.lock);
-	/* Get the total GPU cycles */
-	for_each_gt(gt, xe, gt_id) {
-		enum xe_force_wake_domains fw;
-
-		hwe = xe_gt_any_hw_engine(gt);
-		if (!hwe)
-			continue;
-
-		fw = xe_hw_engine_to_fw_domain(hwe);
-
-		fw_ref = xe_force_wake_get(gt_to_fw(gt), fw);
-		if (!xe_force_wake_ref_has_domain(fw_ref, fw)) {
-			hwe = NULL;
-			xe_force_wake_put(gt_to_fw(gt), fw_ref);
-			break;
-		}
-
-		gpu_timestamp = xe_hw_engine_read_timestamp(hwe);
-		xe_force_wake_put(gt_to_fw(gt), fw_ref);
-		break;
-	}
+	gpu_timestamp = xe_hw_engine_read_timestamp(hwe);
+	xe_force_wake_put(gt_to_fw(hwe->gt), fw_ref);
 	xe_pm_runtime_put(xe);
-	if (unlikely(!hwe))
-		return;
-
 	for (class = 0; class < XE_ENGINE_CLASS_MAX; class++) {
 		const char *class_name;
--
2.47.0