[PATCH 6/7] drm/xe/client: Print runtime to fdinfo

Umesh Nerlige Ramappa umesh.nerlige.ramappa at intel.com
Tue Apr 16 23:20:54 UTC 2024


On Mon, Apr 15, 2024 at 08:04:53PM -0700, Lucas De Marchi wrote:
>Print the accumulated runtime, per client, when printing fdinfo.
>Each time a query is done it first does 2 things:
>
>1) loop through all the exec queues for the current client and
>   accumulates the runtime, per engine class. CTX_TIMESTAMP is used for
>   that, being read from the context image.
>
>2) Read a "GPU timestamp" that can be used for considering "how much GPU
>   time has passed" and that has the same unit/ref-clock as the one
>   recording the runtime. RING_TIMESTAMP is used for that via MMIO.
>
>This second part is done once per engine class, since it's a register
>that is replicated on all engines. It is however the same timestamp, at
>least on the GPUs this was tested on. It may be simplified later, but
>in order to play it safe and cover the case where the clock differs in
>future for primary/media GTs, or across engine classes, just read it
>per class.
>
>This is exported to userspace as 2 numbers in fdinfo:
>
>	drm-engine-<class>: <GPU_TIMESTAMP> <RUNTIME> ticks
>
>Userspace is expected to collect at least 2 samples, which allows it to
>compute the engine busyness as:
>
>		    RUNTIME1 - RUNTIME0
>	busyness = ---------------------
>			  T1 - T0
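
Side note for anyone reading along: a minimal userspace sketch of this
calculation could look like the below. It is illustrative only and not part
of the patch; the struct and values are made up, only the fdinfo field
meanings follow the format above.

/*
 * Illustrative sketch: compute the busyness of one engine class for one
 * client from two fdinfo samples of the form
 * "drm-engine-<class>: <GPU_TIMESTAMP> <RUNTIME> ticks".
 */
#include <stdint.h>
#include <stdio.h>

struct sample {
	uint64_t gpu_timestamp;	/* T: GPU timestamp in ticks */
	uint64_t runtime;	/* RUNTIME: accumulated ticks for the client */
};

static double busyness(const struct sample *s0, const struct sample *s1)
{
	uint64_t elapsed = s1->gpu_timestamp - s0->gpu_timestamp;

	if (!elapsed)
		return 0.0;

	return (double)(s1->runtime - s0->runtime) / elapsed;
}

int main(void)
{
	/* made-up values for two samples taken a few seconds apart */
	struct sample s0 = { .gpu_timestamp = 1000000, .runtime =  250000 };
	struct sample s1 = { .gpu_timestamp = 5000000, .runtime = 1250000 };

	printf("busyness: %.1f%%\n", 100.0 * busyness(&s0, &s1));
	return 0;
}

For classes where drm-engine-capacity-<class> is greater than 1 the ratio can
exceed 100%, and userspace may want to divide by the capacity; summing the
per-client ratios gives the overall system busyness mentioned below.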
>
>When calculating the overall system busyness, userspace can loop through
>all the clients and add up all the numbers.  Since the GPU timestamp
>will be slightly different for each client, some fluctuation in accuracy
>is expected, but that may be improved with a better hardware/GuC
>interface in the future, while maintaining the UAPI.
>
>Another thing to point out is that userspace is expected to read any
>2 samples every few seconds.  Given the update frequency of the
>counters involved and that CTX_TIMESTAMP is 32 bits, it is expected to
>wrap every 25 ~ 30 seconds.

I think the worker I had implemented in my original patches (as well as
the i915 implementation) may have been confusing.

A 32-bit counter ticking at 19.2 MHz should wrap around in 223.696
seconds. The worker, on the other hand, was overcompensating for it and
running at 1/8th of this wrap-around time.
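
Just to spell out the arithmetic (standalone sketch, not part of the patch):

/* Sketch: wrap period of a 32-bit counter ticking at 19.2 MHz */
#include <stdio.h>

int main(void)
{
	const double freq_hz = 19.2e6;
	const double wrap_s = 4294967296.0 / freq_hz;	/* 2^32 ticks */

	/* ~223.696 s for a full wrap; 1/8th of that is ~27.96 s */
	printf("wrap: %.3f s, 1/8 wrap: %.3f s\n", wrap_s, wrap_s / 8);
	return 0;
}

That ~28 second worker period is presumably where the 25 ~ 30 second estimate
in the commit message comes from, rather than the actual wrap time.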

> This could be mitigated by adding a workqueue to
>accumulate the counters every so often, but it's additional complexity
>for something that is done already by userspace every few seconds in
>tools like gputop (from igt), htop, nvtop, etc.
>
>Test-with: https://lore.kernel.org/igt-dev/20240405060056.59379-1-lucas.demarchi@intel.com/
>Signed-off-by: Lucas De Marchi <lucas.demarchi at intel.com>
>---
> drivers/gpu/drm/xe/xe_drm_client.c | 81 +++++++++++++++++++++++++++++-
> 1 file changed, 80 insertions(+), 1 deletion(-)
>
>diff --git a/drivers/gpu/drm/xe/xe_drm_client.c b/drivers/gpu/drm/xe/xe_drm_client.c
>index 08f0b7c95901..79eb453bfb14 100644
>--- a/drivers/gpu/drm/xe/xe_drm_client.c
>+++ b/drivers/gpu/drm/xe/xe_drm_client.c
>@@ -2,6 +2,7 @@
> /*
>  * Copyright © 2023 Intel Corporation
>  */
>+#include "xe_drm_client.h"
>
> #include <drm/drm_print.h>
> #include <drm/xe_drm.h>
>@@ -12,7 +13,10 @@
> #include "xe_bo.h"
> #include "xe_bo_types.h"
> #include "xe_device_types.h"
>-#include "xe_drm_client.h"
>+#include "xe_exec_queue.h"
>+#include "xe_gt.h"
>+#include "xe_hw_engine.h"
>+#include "xe_pm.h"
> #include "xe_trace.h"
>
> /**
>@@ -179,6 +183,80 @@ static void show_meminfo(struct drm_printer *p, struct drm_file *file)
> 	}
> }
>
>+static const u64 class_to_mask[] = {
>+	[XE_ENGINE_CLASS_RENDER] = XE_HW_ENGINE_RCS_MASK,
>+	[XE_ENGINE_CLASS_VIDEO_DECODE] = XE_HW_ENGINE_VCS_MASK,
>+	[XE_ENGINE_CLASS_VIDEO_ENHANCE] = XE_HW_ENGINE_VECS_MASK,
>+	[XE_ENGINE_CLASS_COPY] = XE_HW_ENGINE_BCS_MASK,
>+	[XE_ENGINE_CLASS_OTHER] = XE_HW_ENGINE_GSCCS_MASK,
>+	[XE_ENGINE_CLASS_COMPUTE] = XE_HW_ENGINE_CCS_MASK,
>+};
>+
>+static void show_runtime(struct drm_printer *p, struct drm_file *file)
>+{
>+	struct xe_file *xef = file->driver_priv;
>+	struct xe_device *xe = xef->xe;
>+	struct xe_gt *gt;
>+	struct xe_hw_engine *hwe;
>+	struct xe_exec_queue *q;
>+	unsigned long i, id_hwe, id_gt, capacity[XE_ENGINE_CLASS_MAX] = { };
>+	u64 gpu_timestamp, engine_mask = 0;
>+	bool gpu_stamp = false;
>+
>+	xe_pm_runtime_get(xe);
>+
>+	mutex_lock(&xef->exec_queue.lock);
>+	xa_for_each(&xef->exec_queue.xa, i, q)
>+		xe_exec_queue_update_runtime(q);
>+	mutex_unlock(&xef->exec_queue.lock);
>+
>+	for_each_gt(gt, xe, id_gt)
>+		engine_mask |= gt->info.engine_mask;
>+
>+	BUILD_BUG_ON(ARRAY_SIZE(class_to_mask) != XE_ENGINE_CLASS_MAX);
>+	for (i = 0; i < XE_ENGINE_CLASS_MAX; i++)
>+		capacity[i] = hweight64(engine_mask & class_to_mask[i]);
>+
>+	/*
>+	 * Iterate over all engines, printing the accumulated
>+	 * runtime for this xef per engine class
>+	 */
>+	for_each_gt(gt, xe, id_gt) {
>+		xe_force_wake_get(gt_to_fw(gt), XE_FW_GT);
>+		for_each_hw_engine(hwe, gt, id_hwe) {
>+			const char *class_name;
>+
>+			if (!capacity[hwe->class])
>+				continue;
>+
>+			/*
>+			 * Use any (first) engine to read a timestamp to be
>+			 * used for every class
>+			 */
>+			if (!gpu_stamp) {

gpu_stamp is not required if the intention is to read the timestamp once
per class (as per the commit message), because you are already setting
capacity to 0 below.
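
i.e. something along these lines for the inner loop (untested, just to
illustrate the suggestion; it only uses what the patch already defines):

		for_each_hw_engine(hwe, gt, id_hwe) {
			const char *class_name;

			if (!capacity[hwe->class])
				continue;

			/*
			 * First engine of a class not handled yet: read the
			 * timestamp for this class here, no gpu_stamp needed.
			 */
			gpu_timestamp = xe_hw_engine_read_timestamp(hwe);

			class_name = xe_hw_engine_class_to_str(hwe->class);

			drm_printf(p, "drm-engine-%s:\t%llu %llu ticks\n",
				   class_name, gpu_timestamp,
				   xef->runtime[hwe->class]);

			if (capacity[hwe->class] > 1)
				drm_printf(p, "drm-engine-capacity-%s:\t%lu\n",
					   class_name, capacity[hwe->class]);

			/* engine class already handled, skip next iterations */
			capacity[hwe->class] = 0;
		}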

Umesh

>+				gpu_timestamp = xe_hw_engine_read_timestamp(hwe);
>+				gpu_stamp = true;
>+			}
>+
>+			class_name = xe_hw_engine_class_to_str(hwe->class);
>+
>+			drm_printf(p, "drm-engine-%s:\t%llu %llu ticks\n",
>+				   class_name, gpu_timestamp,
>+				   xef->runtime[hwe->class]);
>+
>+			if (capacity[hwe->class] > 1)
>+				drm_printf(p, "drm-engine-capacity-%s:\t%lu\n",
>+					   class_name, capacity[hwe->class]);
>+
>+			/* engine class already handled, skip next iterations */
>+			capacity[hwe->class] = 0;
>+		}
>+		xe_force_wake_put(gt_to_fw(gt), XE_FW_GT);
>+	}
>+
>+	xe_pm_runtime_put(xe);
>+}
>+
> /**
>  * xe_drm_client_fdinfo() - Callback for fdinfo interface
>  * @p: The drm_printer ptr
>@@ -192,5 +270,6 @@ static void show_meminfo(struct drm_printer *p, struct drm_file *file)
> void xe_drm_client_fdinfo(struct drm_printer *p, struct drm_file *file)
> {
> 	show_meminfo(p, file);
>+	show_runtime(p, file);
> }
> #endif
>-- 
>2.43.0
>

