[PATCH v3 5/5] drm/xe: Sample gpu timestamp closer to exec queues
Cavitt, Jonathan
jonathan.cavitt at intel.com
Mon Nov 4 16:11:16 UTC 2024
-----Original Message-----
From: Intel-xe <intel-xe-bounces at lists.freedesktop.org> On Behalf Of Lucas De Marchi
Sent: Monday, November 4, 2024 6:38 AM
To: intel-xe at lists.freedesktop.org
Cc: De Marchi, Lucas <lucas.demarchi at intel.com>
Subject: [PATCH v3 5/5] drm/xe: Sample gpu timestamp closer to exec queues
>
> Move the force_wake_get to the beginning of the function so the gpu
> timestamp can be closer to the sampled value for exec queues. This
> avoids additional delays waiting for force wake ack which can make the
> proportion between cycles/total_cycles fluctuate either to more or to
> less than the real value,
s/either to more or to less than/around
> depending if the delay happened in the first
> or second sample of a top-like application:
Top-like application? You mean a spinning application?
>
> sample 0:
> read_exec_queue_timestamp
> <<<< (A)
> read_gpu_timestamp
> sample 1:
> read_exec_queue_timestamp
> <<<<< (B)
> read_gpu_timestamp
>
> Additional delay in (A) will cause the utilization to be bigger
> than it should. Additional delay in (B) will cause the utilization to be
> smaller than it should.
Maybe:
"""
In the above case, utilization can be bigger or smaller than it should
be, depending on whether (A) or (B) receives the additional delay,
respectively.
"""
>
> With this a LNL system that was failing after ~60 iterations, get to run
> to 100 without a failure. This is still not perfect, and it's easy to
> introduce errors by just loading the CPU with `stress --cpu $(nproc)` -
> the same igt test in this case fails after 2 or 3 iterations.
I'd say either specify the failing IGT test or cut this paragraph.
>
> Although not perfect, this could be viewed as just extracting that ugly
> loop to "find the forcewake for any available engine" into a separate
> function, and at the same time improving the results a little bit.
This seems like an aside and can probably be cut. If you think keeping
it makes this patch more likely to land, then IMO it should be merged
into the first paragraph.
Something like:
"""
Move the force_wake_get to the beginning of the show_run_ticks
function so the gpu timestamp can be closer to the sampled value
for exec queues. This cleans the code by extracting the for-loop
that finds the forcewake of the first available engine into its own
function. Additionally, this helps avoid additional delays waiting
for force wake ack which can...
"""
Note that by saying it "helps avoid" rather than "avoids", we go
from a full fix to one that reduces harm, meaning we don't have
to specify the imperfection of this fix later.
>
> Signed-off-by: Lucas De Marchi <lucas.demarchi at intel.com>
> ---
> drivers/gpu/drm/xe/xe_drm_client.c | 63 ++++++++++++++++++------------
> 1 file changed, 39 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_drm_client.c b/drivers/gpu/drm/xe/xe_drm_client.c
> index 24a0a7377abf2..874c2157baf49 100644
> --- a/drivers/gpu/drm/xe/xe_drm_client.c
> +++ b/drivers/gpu/drm/xe/xe_drm_client.c
> @@ -269,6 +269,40 @@ static void show_meminfo(struct drm_printer *p, struct drm_file *file)
> }
> }
>
> +static bool force_wake_get_for_gpu_timestamp(struct xe_device *xe,
> + struct xe_hw_engine **phwe,
> + unsigned int *pfw_ref)
> +{
This is acceptable, though perhaps we should just call it
"force_wake_get_any_engine" or
"force_wake_get_first_engine"?
> + struct xe_gt *gt;
> + unsigned long gt_id;
> +
> + for_each_gt(gt, xe, gt_id) {
> + struct xe_hw_engine *hwe;
> + enum xe_force_wake_domains domain;
> + unsigned int fw_ref;
> + struct xe_force_wake *fw;
> +
> + hwe = xe_gt_any_hw_engine(gt);
> + if (!hwe)
> + continue;
> +
> + domain = xe_hw_engine_to_fw_domain(hwe);
> + fw = gt_to_fw(gt);
> +
> + fw_ref = xe_force_wake_get(fw, domain);
> + if (!xe_force_wake_ref_has_domain(fw_ref, domain)) {
> + xe_force_wake_put(fw, fw_ref);
> + return false;
> + }
> +
> + *phwe = hwe;
> + *pfw_ref = fw_ref;
> + return true;
> + }
> +
> + return false;
> +}
> +
> static void show_run_ticks(struct drm_printer *p, struct drm_file *file)
> {
> unsigned long class, i, gt_id, capacity[XE_ENGINE_CLASS_MAX] = { };
> @@ -282,6 +316,9 @@ static void show_run_ticks(struct drm_printer *p, struct drm_file *file)
>
> xe_pm_runtime_get(xe);
>
> + if (!force_wake_get_for_gpu_timestamp(xe, &hwe, &fw_ref))
> + return;
> +
> /* Accumulate all the exec queues from this client */
> mutex_lock(&xef->exec_queue.lock);
> xa_for_each(&xef->exec_queue.xa, i, q) {
> @@ -295,27 +332,7 @@ static void show_run_ticks(struct drm_printer *p, struct drm_file *file)
> }
> mutex_unlock(&xef->exec_queue.lock);
>
> - /* Get the total GPU cycles */
> - for_each_gt(gt, xe, gt_id) {
> - enum xe_force_wake_domains fw;
> -
> - hwe = xe_gt_any_hw_engine(gt);
> - if (!hwe)
> - continue;
> -
> - fw = xe_hw_engine_to_fw_domain(hwe);
> -
> - fw_ref = xe_force_wake_get(gt_to_fw(gt), fw);
> - if (!xe_force_wake_ref_has_domain(fw_ref, fw)) {
> - hwe = NULL;
> - xe_force_wake_put(gt_to_fw(gt), fw_ref);
> - break;
> - }
> -
> - gpu_timestamp = xe_hw_engine_read_timestamp(hwe);
> - xe_force_wake_put(gt_to_fw(gt), fw_ref);
> - break;
> - }
> + gpu_timestamp = xe_hw_engine_read_timestamp(hwe);
>
> /*
> * Wait for any exec queue going away: their cycles will get updated on
> @@ -324,11 +341,9 @@ static void show_run_ticks(struct drm_printer *p, struct drm_file *file)
> wait_var_event(&xef->exec_queue.pending_removal,
> !atomic_read(&xef->exec_queue.pending_removal));
>
> + xe_force_wake_put(gt_to_fw(hwe->gt), fw_ref);
> xe_pm_runtime_put(xe);
>
> - if (unlikely(!hwe))
> - return;
> -
> for (class = 0; class < XE_ENGINE_CLASS_MAX; class++) {
> const char *class_name;
>
> --
I know I've written a lot of suggestions at this point, but none of them
is particularly blocking, so just clean up the commit message a bit and
this is
Reviewed-by: Jonathan Cavitt <jonathan.cavitt at intel.com>
-Jonathan Cavitt
> 2.47.0
>
>