[PATCH v3 5/5] drm/xe: Sample gpu timestamp closer to exec queues

Lucas De Marchi lucas.demarchi at intel.com
Mon Nov 4 20:10:08 UTC 2024


On Mon, Nov 04, 2024 at 11:12:51AM -0800, Umesh Nerlige Ramappa wrote:
>On Mon, Nov 04, 2024 at 06:38:15AM -0800, Lucas De Marchi wrote:
>>Move the force_wake_get to the beginning of the function so the gpu
>>timestamp can be read closer to the sampled exec queue values. This
>>avoids additional delays waiting for the forcewake ack, which can make
>>the cycles/total_cycles proportion fluctuate above or below the real
>>value, depending on whether the delay happened in the first or second
>>sample of a top-like application:
>>
>>	sample 0:
>>		read_exec_queue_timestamp
>>					<<<< (A)
>>		read_gpu_timestamp
>>	sample 1:
>>		read_exec_queue_timestamp
>>					<<<<< (B)
>>		read_gpu_timestamp
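
To make that concrete with made-up numbers: the utilization is roughly
(e1 - e0) / (g1 - g0), where e* are the exec queue timestamps and g* the
gpu timestamps read in each sample. If the real ratio is 500/1000 = 50%,
a delay at (A) inflates g0 and shrinks the denominator (e.g. 500/900 ~=
56%), while a delay at (B) inflates g1 and grows it (e.g. 500/1100 ~=
45%).
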
>
>This is inevitable in sampling where 2 values are sampled in each
>iteration and a ratio is taken. Since you are trying to reduce the
>delta between the 2 values in each sample, this patch does improve
>things compared to before, so this is
>
>Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
>
>That said, I do have some concerns/suggestions which can be addressed
>now or later.
>
>1) The gt timestamp is theoretically allowed to tick at different
>frequencies for the media vs the primary GT. We could run into a
>compatibility issue between the run ticks and the gt timestamp at some
>point. I would think q->class and the gt used should be compatible.

This was considered when enabling it for the first time, but it's not
the case on any platform. In theory it could be different for each HWE
and we'd need to read it again, but there's no point in doing that if
it doesn't correspond to reality.

>2) Can we cache the "any hwe" early on at driver load and also update
>it whenever it changes at runtime? Looking up the hwe on every fdinfo
>query seems unnecessary.

Good idea... maybe I should just do that now instead of this patch, so
the function we extract is actually the helper that selects the hwe.
Then the force_wake get/put becomes trivial and we don't need to
extract it like it's done here.

I will prep something to see how it looks and update.
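
Roughly what I have in mind, as an untested sketch (the helper name is
just a placeholder, and whether it's worth caching the result at probe
time can still be decided on top):

	/*
	 * Pick the first available engine on any GT: any of them is good
	 * enough for reading a global gpu timestamp.
	 */
	static struct xe_hw_engine *any_engine(struct xe_device *xe)
	{
		struct xe_gt *gt;
		unsigned long gt_id;

		for_each_gt(gt, xe, gt_id) {
			struct xe_hw_engine *hwe = xe_gt_any_hw_engine(gt);

			if (hwe)
				return hwe;
		}

		return NULL;
	}

Then show_run_ticks() can do the lookup before even taking the runtime
pm ref, since the lookup itself doesn't touch the HW:

	hwe = any_engine(xe);
	if (!hwe)
		return;

	xe_pm_runtime_get(xe);
	fw_ref = xe_force_wake_get(gt_to_fw(hwe->gt),
				   xe_hw_engine_to_fw_domain(hwe));

with the usual xe_force_wake_ref_has_domain() check on fw_ref and the
matching xe_force_wake_put(gt_to_fw(hwe->gt), fw_ref) after reading the
timestamp.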

thanks
Lucas De Marchi

>
>Thanks,
>Umesh
>
>>
>>Additional delay in (A) will cause the utilization to be bigger than
>>it should be. Additional delay in (B) will cause the utilization to be
>>smaller than it should be.
>>
>>With this, a LNL system that was failing after ~60 iterations gets to
>>run 100 iterations without a failure. This is still not perfect, and
>>it's easy to introduce errors just by loading the CPU with
>>`stress --cpu $(nproc)` - in that case the same igt test fails after 2
>>or 3 iterations.
>>
>>Although not perfect, this could be viewed as just extracting that ugly
>>loop to "find the forcewake for any available engine" into a separate
>>function, and at the same time improving the results a little bit.
>>
>>Signed-off-by: Lucas De Marchi <lucas.demarchi at intel.com>
>>---
>>drivers/gpu/drm/xe/xe_drm_client.c | 63 ++++++++++++++++++------------
>>1 file changed, 39 insertions(+), 24 deletions(-)
>>
>>diff --git a/drivers/gpu/drm/xe/xe_drm_client.c b/drivers/gpu/drm/xe/xe_drm_client.c
>>index 24a0a7377abf2..874c2157baf49 100644
>>--- a/drivers/gpu/drm/xe/xe_drm_client.c
>>+++ b/drivers/gpu/drm/xe/xe_drm_client.c
>>@@ -269,6 +269,40 @@ static void show_meminfo(struct drm_printer *p, struct drm_file *file)
>>	}
>>}
>>
>>+static bool force_wake_get_for_gpu_timestamp(struct xe_device *xe,
>>+					     struct xe_hw_engine **phwe,
>>+					     unsigned int *pfw_ref)
>>+{
>>+	struct xe_gt *gt;
>>+	unsigned long gt_id;
>>+
>>+	for_each_gt(gt, xe, gt_id) {
>>+		struct xe_hw_engine *hwe;
>>+		enum xe_force_wake_domains domain;
>>+		unsigned int fw_ref;
>>+		struct xe_force_wake *fw;
>>+
>>+		hwe = xe_gt_any_hw_engine(gt);
>>+		if (!hwe)
>>+			continue;
>>+
>>+		domain = xe_hw_engine_to_fw_domain(hwe);
>>+		fw = gt_to_fw(gt);
>>+
>>+		fw_ref = xe_force_wake_get(fw, domain);
>>+		if (!xe_force_wake_ref_has_domain(fw_ref, domain)) {
>>+			xe_force_wake_put(fw, fw_ref);
>>+			return false;
>>+		}
>>+
>>+		*phwe = hwe;
>>+		*pfw_ref = fw_ref;
>>+		return true;
>>+	}
>>+
>>+	return false;
>>+}
>>+
>>static void show_run_ticks(struct drm_printer *p, struct drm_file *file)
>>{
>>	unsigned long class, i, gt_id, capacity[XE_ENGINE_CLASS_MAX] = { };
>>@@ -282,6 +316,9 @@ static void show_run_ticks(struct drm_printer *p, struct drm_file *file)
>>
>>	xe_pm_runtime_get(xe);
>>
>>+	if (!force_wake_get_for_gpu_timestamp(xe, &hwe, &fw_ref))
>>+		return;
>>+
>>	/* Accumulate all the exec queues from this client */
>>	mutex_lock(&xef->exec_queue.lock);
>>	xa_for_each(&xef->exec_queue.xa, i, q) {
>>@@ -295,27 +332,7 @@ static void show_run_ticks(struct drm_printer *p, struct drm_file *file)
>>	}
>>	mutex_unlock(&xef->exec_queue.lock);
>>
>>-	/* Get the total GPU cycles */
>>-	for_each_gt(gt, xe, gt_id) {
>>-		enum xe_force_wake_domains fw;
>>-
>>-		hwe = xe_gt_any_hw_engine(gt);
>>-		if (!hwe)
>>-			continue;
>>-
>>-		fw = xe_hw_engine_to_fw_domain(hwe);
>>-
>>-		fw_ref = xe_force_wake_get(gt_to_fw(gt), fw);
>>-		if (!xe_force_wake_ref_has_domain(fw_ref, fw)) {
>>-			hwe = NULL;
>>-			xe_force_wake_put(gt_to_fw(gt), fw_ref);
>>-			break;
>>-		}
>>-
>>-		gpu_timestamp = xe_hw_engine_read_timestamp(hwe);
>>-		xe_force_wake_put(gt_to_fw(gt), fw_ref);
>>-		break;
>>-	}
>>+	gpu_timestamp = xe_hw_engine_read_timestamp(hwe);
>>
>>	/*
>>	 * Wait for any exec queue going away: their cycles will get updated on
>>@@ -324,11 +341,9 @@ static void show_run_ticks(struct drm_printer *p, struct drm_file *file)
>>	wait_var_event(&xef->exec_queue.pending_removal,
>>		       !atomic_read(&xef->exec_queue.pending_removal));
>>
>>+	xe_force_wake_put(gt_to_fw(hwe->gt), fw_ref);
>>	xe_pm_runtime_put(xe);
>>
>>-	if (unlikely(!hwe))
>>-		return;
>>-
>>	for (class = 0; class < XE_ENGINE_CLASS_MAX; class++) {
>>		const char *class_name;
>>
>>-- 
>>2.47.0
>>

