[PATCH v4 2/3] drm/xe: Sample gpu timestamp closer to exec queues
Cavitt, Jonathan
jonathan.cavitt at intel.com
Fri Nov 8 15:29:45 UTC 2024
-----Original Message-----
From: De Marchi, Lucas <lucas.demarchi at intel.com>
Sent: Thursday, November 7, 2024 9:33 PM
To: intel-xe at lists.freedesktop.org
Cc: Cavitt, Jonathan <jonathan.cavitt at intel.com>; Nerlige Ramappa, Umesh <umesh.nerlige.ramappa at intel.com>; Brost, Matthew <matthew.brost at intel.com>; De Marchi, Lucas <lucas.demarchi at intel.com>
Subject: [PATCH v4 2/3] drm/xe: Sample gpu timestamp closer to exec queues
>
> Move the force_wake_get to the beginning of the function so the gpu
> timestamp is read closer to the values sampled for the exec queues. This
> avoids additional delays waiting for the force wake ack, which can make
> the cycles/total_cycles ratio fluctuate around the real value.
Maybe add a newline here, but otherwise
Reviewed-by: Jonathan Cavitt <jonathan.cavitt at intel.com>
-Jonathan Cavitt
> For a gputop-like application getting 2 samples to calculate the
> utilization:
>
> sample 0:
> read_exec_queue_timestamp
> <<<< (A)
> read_gpu_timestamp
> sample 1:
> read_exec_queue_timestamp
> <<<<< (B)
> read_gpu_timestamp
>
> In the above case, the utilization can come out bigger or smaller than it
> should be, depending on whether the additional delay lands at (A) or (B),
> respectively.
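For reference, a minimal userspace-side sketch of the calculation above
(illustration only, not part of the patch; the utilization() helper and the
sample numbers are hypothetical, the fdinfo parsing is elided, and it assumes
the drm-cycles-<class> / drm-total-cycles-<class> keys described in
Documentation/gpu/drm-usage-stats.rst):

#include <stdint.h>
#include <stdio.h>

/*
 * q0/q1: drm-cycles-<class> from sample 0 and 1 (exec queue runtime)
 * t0/t1: drm-total-cycles-<class> from sample 0 and 1 (gpu timestamp)
 */
static double utilization(uint64_t q0, uint64_t t0, uint64_t q1, uint64_t t1)
{
	/*
	 * Both deltas are assumed to cover the same wall-clock window. If
	 * the gpu timestamp read in sample 0 is delayed (point (A) above),
	 * t1 - t0 shrinks and the result comes out too big; if the delay
	 * hits sample 1 (point (B)), t1 - t0 grows and it comes out too
	 * small.
	 */
	return 100.0 * (double)(q1 - q0) / (double)(t1 - t0);
}

int main(void)
{
	/* Hypothetical numbers for a fully loaded queue. */
	uint64_t q0 = 1000000, t0 = 5000000;
	uint64_t q1 = 2000000, t1 = 6000000;

	printf("utilization: %.1f%%\n", utilization(q0, t0, q1, t1)); /* 100.0% */
	return 0;
}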
>
> With this, an LNL system that was failing
> `xe_drm_fdinfo --r utilization-single-full-load` after ~60 iterations
> gets to run 100 iterations without a failure. This is still not perfect,
> and it's easy to introduce errors by just loading the CPU with `stress
> --cpu $(nproc)` - the same igt test then fails after 2 or 3 iterations.
> That will be dealt with in the test itself, by using a longer sampling
> period so any remaining skew is a smaller fraction of the sampled deltas.
>
> v2: Rename function and add another to get "any engine", preparing for
> caching the hwe in future (Umesh / Jonathan)
>
> Signed-off-by: Lucas De Marchi <lucas.demarchi at intel.com>
> ---
> drivers/gpu/drm/xe/xe_drm_client.c | 73 ++++++++++++++++++++----------
> 1 file changed, 49 insertions(+), 24 deletions(-)
>
> diff --git a/drivers/gpu/drm/xe/xe_drm_client.c b/drivers/gpu/drm/xe/xe_drm_client.c
> index dd4e16a84874c..298a587da7f17 100644
> --- a/drivers/gpu/drm/xe/xe_drm_client.c
> +++ b/drivers/gpu/drm/xe/xe_drm_client.c
> @@ -269,6 +269,49 @@ static void show_meminfo(struct drm_printer *p, struct drm_file *file)
> }
> }
>
> +static struct xe_hw_engine *any_engine(struct xe_device *xe)
> +{
> + struct xe_gt *gt;
> + unsigned long gt_id;
> +
> + for_each_gt(gt, xe, gt_id) {
> + struct xe_hw_engine *hwe = xe_gt_any_hw_engine(gt);
> +
> + if (hwe)
> + return hwe;
> + }
> +
> + return NULL;
> +}
> +
> +static bool force_wake_get_any_engine(struct xe_device *xe,
> + struct xe_hw_engine **phwe,
> + unsigned int *pfw_ref)
> +{
> + enum xe_force_wake_domains domain;
> + unsigned int fw_ref;
> + struct xe_hw_engine *hwe;
> + struct xe_force_wake *fw;
> +
> + hwe = any_engine(xe);
> + if (!hwe)
> + return false;
> +
> + domain = xe_hw_engine_to_fw_domain(hwe);
> + fw = gt_to_fw(hwe->gt);
> +
> + fw_ref = xe_force_wake_get(fw, domain);
> + if (!xe_force_wake_ref_has_domain(fw_ref, domain)) {
> + xe_force_wake_put(fw, fw_ref);
> + return false;
> + }
> +
> + *phwe = hwe;
> + *pfw_ref = fw_ref;
> +
> + return true;
> +}
> +
> static void show_run_ticks(struct drm_printer *p, struct drm_file *file)
> {
> unsigned long class, i, gt_id, capacity[XE_ENGINE_CLASS_MAX] = { };
> @@ -288,6 +331,10 @@ static void show_run_ticks(struct drm_printer *p, struct drm_file *file)
> !atomic_read(&xef->exec_queue.pending_removal));
>
> xe_pm_runtime_get(xe);
> + if (!force_wake_get_any_engine(xe, &hwe, &fw_ref)) {
> + xe_pm_runtime_put(xe);
> + return;
> + }
>
> /* Accumulate all the exec queues from this client */
> mutex_lock(&xef->exec_queue.lock);
> @@ -302,33 +349,11 @@ static void show_run_ticks(struct drm_printer *p, struct drm_file *file)
> }
> mutex_unlock(&xef->exec_queue.lock);
>
> - /* Get the total GPU cycles */
> - for_each_gt(gt, xe, gt_id) {
> - enum xe_force_wake_domains fw;
> -
> - hwe = xe_gt_any_hw_engine(gt);
> - if (!hwe)
> - continue;
> -
> - fw = xe_hw_engine_to_fw_domain(hwe);
> -
> - fw_ref = xe_force_wake_get(gt_to_fw(gt), fw);
> - if (!xe_force_wake_ref_has_domain(fw_ref, fw)) {
> - hwe = NULL;
> - xe_force_wake_put(gt_to_fw(gt), fw_ref);
> - break;
> - }
> -
> - gpu_timestamp = xe_hw_engine_read_timestamp(hwe);
> - xe_force_wake_put(gt_to_fw(gt), fw_ref);
> - break;
> - }
> + gpu_timestamp = xe_hw_engine_read_timestamp(hwe);
>
> + xe_force_wake_put(gt_to_fw(hwe->gt), fw_ref);
> xe_pm_runtime_put(xe);
>
> - if (unlikely(!hwe))
> - return;
> -
> for (class = 0; class < XE_ENGINE_CLASS_MAX; class++) {
> const char *class_name;
>
> --
> 2.47.0
>
>