[Intel-xe] [PATCH 3/3] drm/xe: Correlate engine and cpu timestamps with better accuracy
Umesh Nerlige Ramappa
umesh.nerlige.ramappa at intel.com
Wed Aug 9 20:18:21 UTC 2023
On Wed, Aug 09, 2023 at 12:55:37PM -0700, Souza, Jose wrote:
>On Fri, 2023-08-04 at 14:32 -0700, Umesh Nerlige Ramappa wrote:
>> Perf measurements rely on CPU and engine timestamps to correlate
>> events of interest across these time domains. Current mechanisms get
>> these timestamps separately, and the calculated delta between them
>> lacks sufficient accuracy.
>>
>> To improve the accuracy of these time measurements to within a few us,
>> add a query that returns the engine and cpu timestamps captured as
>> close to each other as possible.
>>
>> Prior work: https://patchwork.freedesktop.org/series/87552/
>>
>> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
>> ---
>> drivers/gpu/drm/xe/xe_query.c | 141 ++++++++++++++++++++++++++++++++++
>> include/uapi/drm/xe_drm.h | 95 ++++++++++++++++++-----
>> 2 files changed, 218 insertions(+), 18 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/xe/xe_query.c b/drivers/gpu/drm/xe/xe_query.c
>> index 99a4800c7c53..ae1e810267a7 100644
>> --- a/drivers/gpu/drm/xe/xe_query.c
>> +++ b/drivers/gpu/drm/xe/xe_query.c
>> @@ -6,10 +6,12 @@
>> #include "xe_query.h"
>>
>> #include <linux/nospec.h>
>> +#include <linux/sched/clock.h>
>>
>> #include <drm/ttm/ttm_placement.h>
>> #include <drm/xe_drm.h>
>>
>> +#include "regs/xe_engine_regs.h"
>> #include "xe_bo.h"
>> #include "xe_device.h"
>> #include "xe_exec_queue.h"
>> @@ -17,6 +19,7 @@
>> #include "xe_gt.h"
>> #include "xe_guc_hwconfig.h"
>> #include "xe_macros.h"
>> +#include "xe_mmio.h"
>> #include "xe_ttm_vram_mgr.h"
>>
>> static const u16 xe_to_user_engine_class[] = {
>> @@ -27,6 +30,14 @@ static const u16 xe_to_user_engine_class[] = {
>> [XE_ENGINE_CLASS_COMPUTE] = DRM_XE_ENGINE_CLASS_COMPUTE,
>> };
>>
>> +static const enum xe_engine_class user_to_xe_engine_class[] = {
>> + [DRM_XE_ENGINE_CLASS_RENDER] = XE_ENGINE_CLASS_RENDER,
>> + [DRM_XE_ENGINE_CLASS_COPY] = XE_ENGINE_CLASS_COPY,
>> + [DRM_XE_ENGINE_CLASS_VIDEO_DECODE] = XE_ENGINE_CLASS_VIDEO_DECODE,
>> + [DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE] = XE_ENGINE_CLASS_VIDEO_ENHANCE,
>> + [DRM_XE_ENGINE_CLASS_COMPUTE] = XE_ENGINE_CLASS_COMPUTE,
>> +};
>> +
>> static size_t calc_hw_engine_info_size(struct xe_device *xe)
>> {
>> struct xe_hw_engine *hwe;
>> @@ -45,6 +56,135 @@ static size_t calc_hw_engine_info_size(struct xe_device *xe)
>> return i * sizeof(struct drm_xe_engine_class_instance);
>> }
>>
>> +typedef u64 (*__ktime_func_t)(void);
>> +static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
>> +{
>> + /*
>> + * Use logic same as the perf subsystem to allow user to select the
>> + * reference clock id to be used for timestamps.
>> + */
>> + switch (clk_id) {
>> + case CLOCK_MONOTONIC:
>> + return &ktime_get_ns;
>> + case CLOCK_MONOTONIC_RAW:
>> + return &ktime_get_raw_ns;
>> + case CLOCK_REALTIME:
>> + return &ktime_get_real_ns;
>> + case CLOCK_BOOTTIME:
>> + return &ktime_get_boottime_ns;
>> + case CLOCK_TAI:
>> + return &ktime_get_clocktai_ns;
>> + default:
>> + return NULL;
>> + }
>> +}
>> +
>> +static void
>> +__read_timestamps(struct xe_gt *gt,
>> + struct xe_reg lower_reg,
>> + struct xe_reg upper_reg,
>> + u64 *cs_ts,
>> + u64 *cpu_ts,
>> + u64 *cpu_delta,
>> + __ktime_func_t cpu_clock)
>> +{
>> + u32 upper, lower, old_upper, loop = 0;
>> +
>> + upper = xe_mmio_read32(gt, upper_reg);
>> + do {
>> + *cpu_delta = local_clock();
>> + *cpu_ts = cpu_clock();
>> + lower = xe_mmio_read32(gt, lower_reg);
>> + *cpu_delta = local_clock() - *cpu_delta;
>> + old_upper = upper;
>> + upper = xe_mmio_read32(gt, upper_reg);
>> + } while (upper != old_upper && loop++ < 2);
>> +
>> + *cs_ts = (u64)upper << 32 | lower;
>> +}
>> +
>> +static int
>> +query_cs_cycles(struct xe_device *xe,
>> + struct drm_xe_device_query *query)
>> +{
>> + struct drm_xe_query_cs_cycles __user *query_ptr;
>> + struct drm_xe_engine_class_instance *eci;
>> + struct drm_xe_query_cs_cycles resp;
>> + size_t size = sizeof(resp);
>> + __ktime_func_t cpu_clock;
>> + struct xe_hw_engine *hwe;
>> + struct xe_gt *gt;
>> +
>> + if (query->size == 0) {
>> + query->size = size;
>> + return 0;
>> + } else if (XE_IOCTL_DBG(xe, query->size != size)) {
>> + return -EINVAL;
>> + }
>> +
>> + query_ptr = u64_to_user_ptr(query->data);
>> + if (copy_from_user(&resp, query_ptr, size))
>> + return -EFAULT;
>> +
>> + if (resp.rsvd)
>> + return -EINVAL;
>> +
>> + cpu_clock = __clock_id_to_func(resp.clockid);
>> + if (!cpu_clock)
>> + return -EINVAL;
>> +
>> + eci = &resp.eci;
>> + if (eci->gt_id > XE_MAX_GT_PER_TILE)
>> + return -EINVAL;
>> +
>> + gt = xe_device_get_gt(xe, eci->gt_id);
>> + if (!gt)
>> + return -EINVAL;
>> +
>> + if (eci->engine_class >= ARRAY_SIZE(user_to_xe_engine_class))
>> + return -EINVAL;
>> +
>> + hwe = xe_gt_hw_engine(gt, user_to_xe_engine_class[eci->engine_class],
>> + eci->engine_instance, true);
>> + if (!hwe)
>> + return -EINVAL;
>> +
>> + resp.cs_frequency = gt->info.clock_freq;
>> +
>> + xe_device_mem_access_get(xe);
>> + xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL);
>> +
>> + __read_timestamps(gt,
>> + RING_TIMESTAMP(hwe->mmio_base),
>> + RING_TIMESTAMP_UDW(hwe->mmio_base),
>> + &resp.cs_cycles,
>> + &resp.cpu_timestamp,
>> + &resp.cpu_delta,
>> + cpu_clock);
>> +
>> + xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL);
>> + xe_device_mem_access_put(xe);
>> + resp.width = 36;
>> +
>> + /* Only write to the output fields of user query */
>> + if (put_user(resp.cs_frequency, &query_ptr->cs_frequency))
>> + return -EFAULT;
>> +
>> + if (put_user(resp.cpu_timestamp, &query_ptr->cpu_timestamp))
>> + return -EFAULT;
>> +
>> + if (put_user(resp.cpu_delta, &query_ptr->cpu_delta))
>> + return -EFAULT;
>> +
>> + if (put_user(resp.cs_cycles, &query_ptr->cs_cycles))
>> + return -EFAULT;
>> +
>> + if (put_user(resp.width, &query_ptr->width))
>> + return -EFAULT;
>> +
>> + return 0;
>> +}
>> +
>> static int query_engines(struct xe_device *xe,
>> struct drm_xe_device_query *query)
>> {
>> @@ -373,6 +513,7 @@ static int (* const xe_query_funcs[])(struct xe_device *xe,
>> query_gts,
>> query_hwconfig,
>> query_gt_topology,
>> + query_cs_cycles,
>> };
>>
>> int xe_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>> diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
>> index 86f16d50e9cc..a5a5a02253f2 100644
>> --- a/include/uapi/drm/xe_drm.h
>> +++ b/include/uapi/drm/xe_drm.h
>> @@ -128,6 +128,24 @@ struct xe_user_extension {
>> #define DRM_IOCTL_XE_WAIT_USER_FENCE DRM_IOWR(DRM_COMMAND_BASE + DRM_XE_WAIT_USER_FENCE, struct drm_xe_wait_user_fence)
>> #define DRM_IOCTL_XE_VM_MADVISE DRM_IOW(DRM_COMMAND_BASE + DRM_XE_VM_MADVISE, struct drm_xe_vm_madvise)
>>
>> +/** struct drm_xe_engine_class_instance - instance of an engine class */
>> +struct drm_xe_engine_class_instance {
>> +#define DRM_XE_ENGINE_CLASS_RENDER 0
>> +#define DRM_XE_ENGINE_CLASS_COPY 1
>> +#define DRM_XE_ENGINE_CLASS_VIDEO_DECODE 2
>> +#define DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE 3
>> +#define DRM_XE_ENGINE_CLASS_COMPUTE 4
>> + /*
>> + * Kernel only class (not actual hardware engine class). Used for
>> + * creating ordered queues of VM bind operations.
>> + */
>> +#define DRM_XE_ENGINE_CLASS_VM_BIND 5
>> + __u16 engine_class;
>> +
>> + __u16 engine_instance;
>> + __u16 gt_id;
>> +};
>> +
>> /**
>> * enum drm_xe_memory_class - Supported memory classes.
>> */
>> @@ -223,6 +241,64 @@ struct drm_xe_query_mem_region {
>> __u64 reserved[6];
>> };
>>
>> +/**
>> + * struct drm_xe_query_cs_cycles - correlate CPU and GPU timestamps
>> + *
>> + * If a query is made with a struct drm_xe_device_query where .query
>> + * is equal to DRM_XE_QUERY_CS_CYCLES, then the reply uses
>> + * struct drm_xe_query_cs_cycles in .data.
>> + *
>> + * struct drm_xe_query_cs_cycles is allocated by the user and .data points to
>> + * this allocated structure. The user must pass .eci and .clockid as inputs to
>> + * this query. eci determines the engine and tile info required to fetch the
>> + * relevant GPU timestamp. clockid is used to return the specific CPU
>> + * timestamp.
>> + *
>> + * The query returns the command streamer cycles and the frequency that can
>> + * be used to calculate the command streamer timestamp. In addition the
>> + * query returns a set of cpu timestamps that indicate when the command
>> + * streamer cycle count was captured.
>> + */
>> +struct drm_xe_query_cs_cycles {
>> + /** Engine for which command streamer cycles is queried. */
>> + struct drm_xe_engine_class_instance eci;
>> +
>> + /** MBZ (pad eci to 64 bit) */
>> + __u16 rsvd;
>> +
>> + /**
>> + * Command streamer cycles as read from the command streamer
>> + * register at 0x358 offset.
>> + */
>> + __u64 cs_cycles;
>> +
>> + /** Frequency of the cs cycles in Hz. */
>> + __u64 cs_frequency;
>> +
>> + /**
>> + * CPU timestamp in ns. The timestamp is captured before reading the
>> + * cs_cycles register using the reference clockid set by the user.
>> + */
>> + __u64 cpu_timestamp;
>> +
>> + /**
>> + * Time delta in ns captured around reading the lower dword of the
>> + * cs_cycles register.
>> + */
>> + __u64 cpu_delta;
>> +
>> + /**
>> + * Reference clock id for CPU timestamp. For definition, see
>> + * clock_gettime(2) and perf_event_open(2). Supported clock ids are
>> + * CLOCK_MONOTONIC, CLOCK_MONOTONIC_RAW, CLOCK_REALTIME, CLOCK_BOOTTIME,
>> + * CLOCK_TAI.
>> + */
>> + __s32 clockid;
>> +
>> + /** Width of the cs cycle counter in bits. */
>> + __u32 width;
>
>It's not very clear what the 'inputs' and 'outputs' of this struct are without reading the implementation...
>It would be better to adjust the comments to help with that. Also, you could change the order to put the 'inputs' first:
The input/output is documented in the structure description rather than on
the individual members. I guess it's better to add it to the individual
members.
>
>drm_xe_query_cs_cycles {
>eci
>rsvd
>clockid
>
>width
>cs_cycles
>cs_frequency
>cpu_timestamp
>cpu_delta
>}
Sure, I can reorder the fields. Not sure if that affects alignment in
any way. Will check.
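
Roughly what I have in mind for v2, as a sketch only (member docs and names
not final): inputs first, with each member marked [in] or [out]. With this
particular order the struct should still pack naturally on a 64-bit ABI
(eci + rsvd make 8 bytes, clockid + width make 8 bytes, then the __u64s),
so no implicit padding gets added:

struct drm_xe_query_cs_cycles {
	/** @eci: [in] engine to sample the timestamp from */
	struct drm_xe_engine_class_instance eci;

	/** @rsvd: [in] MBZ (pads @eci to 64 bit) */
	__u16 rsvd;

	/**
	 * @clockid: [in] reference CPU clock id, one of CLOCK_MONOTONIC,
	 * CLOCK_MONOTONIC_RAW, CLOCK_REALTIME, CLOCK_BOOTTIME or CLOCK_TAI
	 */
	__s32 clockid;

	/** @width: [out] width of the engine cycle counter in bits */
	__u32 width;

	/** @cs_cycles: [out] engine cycles as read from its timestamp register */
	__u64 cs_cycles;

	/** @cs_frequency: [out] frequency of the engine cycles in Hz */
	__u64 cs_frequency;

	/**
	 * @cpu_timestamp: [out] CPU timestamp in ns, captured just before
	 * reading the lower dword of the cycle counter
	 */
	__u64 cpu_timestamp;

	/**
	 * @cpu_delta: [out] time delta in ns captured around reading the
	 * lower dword of the cycle counter
	 */
	__u64 cpu_delta;
};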
>
>Also, maybe rename cs to engine; 'command streamer'/'cs' is not used anywhere else in xe_drm.h.
will do
>
>Other than that LGTM.
>Mesa side is implemented here: https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24591
Thanks, I will include this link in the commit msg.
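
For anyone following along, userspace consumption would look roughly like the
below (an untested sketch; it assumes DRM_IOCTL_XE_DEVICE_QUERY and the uapi
names from this patch, with drmIoctl() from libdrm):

#include <stdint.h>
#include <time.h>
#include <xf86drm.h>
#include <drm/xe_drm.h>

/*
 * Sample one RCS timestamp together with CLOCK_MONOTONIC and convert it
 * to ns. Returns 0 on success, -1 (with errno set) otherwise.
 */
static int sample_gpu_cpu_time(int fd, uint64_t *gpu_ns, uint64_t *cpu_ns)
{
	struct drm_xe_query_cs_cycles cycles = {
		.eci = {
			.engine_class = DRM_XE_ENGINE_CLASS_RENDER,
			.engine_instance = 0,
			.gt_id = 0,
		},
		.clockid = CLOCK_MONOTONIC,
	};
	struct drm_xe_device_query query = {
		.query = DRM_XE_QUERY_CS_CYCLES,
		.size = sizeof(cycles),
		.data = (uintptr_t)&cycles,
	};

	if (drmIoctl(fd, DRM_IOCTL_XE_DEVICE_QUERY, &query))
		return -1;

	/*
	 * Naive conversion; a real tool should watch for overflow here
	 * since cs_cycles can be up to 'width' bits wide.
	 */
	*gpu_ns = cycles.cs_cycles * 1000000000ull / cycles.cs_frequency;

	/* CPU time on CLOCK_MONOTONIC, accurate to about cpu_delta ns. */
	*cpu_ns = cycles.cpu_timestamp;
	return 0;
}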
Umesh
>
>
>> +};
>> +
>> /**
>> * struct drm_xe_query_mem_usage - describe memory regions and usage
>> *
>> @@ -395,6 +471,7 @@ struct drm_xe_device_query {
>> #define DRM_XE_DEVICE_QUERY_GTS 3
>> #define DRM_XE_DEVICE_QUERY_HWCONFIG 4
>> #define DRM_XE_DEVICE_QUERY_GT_TOPOLOGY 5
>> +#define DRM_XE_QUERY_CS_CYCLES 6
>> /** @query: The type of data to query */
>> __u32 query;
>>
>> @@ -737,24 +814,6 @@ struct drm_xe_exec_queue_set_property {
>> __u64 reserved[2];
>> };
>>
>> -/** struct drm_xe_engine_class_instance - instance of an engine class */
>> -struct drm_xe_engine_class_instance {
>> -#define DRM_XE_ENGINE_CLASS_RENDER 0
>> -#define DRM_XE_ENGINE_CLASS_COPY 1
>> -#define DRM_XE_ENGINE_CLASS_VIDEO_DECODE 2
>> -#define DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE 3
>> -#define DRM_XE_ENGINE_CLASS_COMPUTE 4
>> - /*
>> - * Kernel only class (not actual hardware engine class). Used for
>> - * creating ordered queues of VM bind operations.
>> - */
>> -#define DRM_XE_ENGINE_CLASS_VM_BIND 5
>> - __u16 engine_class;
>> -
>> - __u16 engine_instance;
>> - __u16 gt_id;
>> -};
>> -
>> struct drm_xe_exec_queue_create {
>> #define XE_EXEC_QUEUE_EXTENSION_SET_PROPERTY 0
>> /** @extensions: Pointer to the first extension struct, if any */
>