[PATCH v4 2/3] drm/xe: Sample gpu timestamp closer to exec queues

Cavitt, Jonathan jonathan.cavitt at intel.com
Fri Nov 8 15:29:45 UTC 2024


-----Original Message-----
From: De Marchi, Lucas <lucas.demarchi at intel.com> 
Sent: Thursday, November 7, 2024 9:33 PM
To: intel-xe at lists.freedesktop.org
Cc: Cavitt, Jonathan <jonathan.cavitt at intel.com>; Nerlige Ramappa, Umesh <umesh.nerlige.ramappa at intel.com>; Brost, Matthew <matthew.brost at intel.com>; De Marchi, Lucas <lucas.demarchi at intel.com>
Subject: [PATCH v4 2/3] drm/xe: Sample gpu timestamp closer to exec queues
> 
> Move the force_wake_get to the beginning of the function so the gpu
> timestamp can be sampled closer to the exec queue values. This avoids
> additional delays waiting for the force wake ack, which can make the
> cycles/total_cycles proportion fluctuate around the real value.

Maybe add a newline here, but otherwise
Reviewed-by: Jonathan Cavitt <jonathan.cavitt at intel.com>
-Jonathan Cavitt

> For a gputop-like application getting 2 samples to calculate the
> utilization:
> 
> 	sample 0:
> 		read_exec_queue_timestamp
> 					<<<< (A)
> 		read_gpu_timestamp
> 	sample 1:
> 		read_exec_queue_timestamp
> 					<<<< (B)
> 		read_gpu_timestamp
> 
> In the above case, utilization can be bigger or smaller than it should
> be, depending on whether (A) or (B) receives the additional delay,
> respectively.
> 
> With this, an LNL system that was failing on
> `xe_drm_fdinfo --r utilization-single-full-load` after ~60 iterations
> now runs to 100 iterations without a failure. This is still not
> perfect, and it's easy to introduce errors by just loading the CPU with
> `stress --cpu $(nproc)` - the same igt test in this case fails after 2
> or 3 iterations. That will be dealt with in the test itself, using a
> longer sampling period.
> 
> v2: Rename function and add another to get "any engine", preparing for
>     caching the hwe in the future (Umesh / Jonathan)
> 
> Signed-off-by: Lucas De Marchi <lucas.demarchi at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_drm_client.c | 73 ++++++++++++++++++++----------
>  1 file changed, 49 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_drm_client.c b/drivers/gpu/drm/xe/xe_drm_client.c
> index dd4e16a84874c..298a587da7f17 100644
> --- a/drivers/gpu/drm/xe/xe_drm_client.c
> +++ b/drivers/gpu/drm/xe/xe_drm_client.c
> @@ -269,6 +269,49 @@ static void show_meminfo(struct drm_printer *p, struct drm_file *file)
>  	}
>  }
>  
> +static struct xe_hw_engine *any_engine(struct xe_device *xe)
> +{
> +	struct xe_gt *gt;
> +	unsigned long gt_id;
> +
> +	for_each_gt(gt, xe, gt_id) {
> +		struct xe_hw_engine *hwe = xe_gt_any_hw_engine(gt);
> +
> +		if (hwe)
> +			return hwe;
> +	}
> +
> +	return NULL;
> +}
> +
> +static bool force_wake_get_any_engine(struct xe_device *xe,
> +				      struct xe_hw_engine **phwe,
> +				      unsigned int *pfw_ref)
> +{
> +	enum xe_force_wake_domains domain;
> +	unsigned int fw_ref;
> +	struct xe_hw_engine *hwe;
> +	struct xe_force_wake *fw;
> +
> +	hwe = any_engine(xe);
> +	if (!hwe)
> +		return false;
> +
> +	domain = xe_hw_engine_to_fw_domain(hwe);
> +	fw = gt_to_fw(hwe->gt);
> +
> +	fw_ref = xe_force_wake_get(fw, domain);
> +	if (!xe_force_wake_ref_has_domain(fw_ref, domain)) {
> +		xe_force_wake_put(fw, fw_ref);
> +		return false;
> +	}
> +
> +	*phwe = hwe;
> +	*pfw_ref = fw_ref;
> +
> +	return true;
> +}
> +
>  static void show_run_ticks(struct drm_printer *p, struct drm_file *file)
>  {
>  	unsigned long class, i, gt_id, capacity[XE_ENGINE_CLASS_MAX] = { };
> @@ -288,6 +331,10 @@ static void show_run_ticks(struct drm_printer *p, struct drm_file *file)
>  		       !atomic_read(&xef->exec_queue.pending_removal));
>  
>  	xe_pm_runtime_get(xe);
> +	if (!force_wake_get_any_engine(xe, &hwe, &fw_ref)) {
> +		xe_pm_runtime_put(xe);
> +		return;
> +	}
>  
>  	/* Accumulate all the exec queues from this client */
>  	mutex_lock(&xef->exec_queue.lock);
> @@ -302,33 +349,11 @@ static void show_run_ticks(struct drm_printer *p, struct drm_file *file)
>  	}
>  	mutex_unlock(&xef->exec_queue.lock);
>  
> -	/* Get the total GPU cycles */
> -	for_each_gt(gt, xe, gt_id) {
> -		enum xe_force_wake_domains fw;
> -
> -		hwe = xe_gt_any_hw_engine(gt);
> -		if (!hwe)
> -			continue;
> -
> -		fw = xe_hw_engine_to_fw_domain(hwe);
> -
> -		fw_ref = xe_force_wake_get(gt_to_fw(gt), fw);
> -		if (!xe_force_wake_ref_has_domain(fw_ref, fw)) {
> -			hwe = NULL;
> -			xe_force_wake_put(gt_to_fw(gt), fw_ref);
> -			break;
> -		}
> -
> -		gpu_timestamp = xe_hw_engine_read_timestamp(hwe);
> -		xe_force_wake_put(gt_to_fw(gt), fw_ref);
> -		break;
> -	}
> +	gpu_timestamp = xe_hw_engine_read_timestamp(hwe);
>  
> +	xe_force_wake_put(gt_to_fw(hwe->gt), fw_ref);
>  	xe_pm_runtime_put(xe);
>  
> -	if (unlikely(!hwe))
> -		return;
> -
>  	for (class = 0; class < XE_ENGINE_CLASS_MAX; class++) {
>  		const char *class_name;
>  
> -- 
> 2.47.0
> 
> 

