[Intel-xe] [PATCH v3 03/30] drm/xe: Correlate engine and cpu timestamps with better accuracy

Souza, Jose jose.souza at intel.com
Tue Sep 26 16:42:00 UTC 2023


On Tue, 2023-09-26 at 12:55 +0000, Francois Dugast wrote:
> From: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
> 
> Perf measurements rely on CPU and engine timestamps to correlate
> events of interest across these time domains. Current mechanisms read
> these timestamps separately, and the delta calculated between them
> lacks sufficient accuracy.
> 
> To improve the accuracy of these measurements to within a few
> microseconds, add a query that returns the engine and CPU timestamps
> captured as close to each other as possible.
> 
> Prior work: https://patchwork.freedesktop.org/series/87552/

Reviewed-by: José Roberto de Souza <jose.souza at intel.com>
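
For reference, a minimal userspace sketch of how this query could be driven
(assuming the usual drm_xe_device_query layout with .query/.size/.data, an
open xe render-node fd and libdrm's drmIoctl(); the helper name here is made
up, only the uapi names come from this patch):

#include <stdint.h>
#include <string.h>
#include <time.h>
#include <xf86drm.h>
#include <drm/xe_drm.h>

/* Sketch: query CS cycles for render engine 0 on GT 0, correlated with
 * CLOCK_MONOTONIC. */
static int query_cs_cycles_example(int fd, struct drm_xe_query_cs_cycles *cycles)
{
	struct drm_xe_device_query query = {
		.query = DRM_XE_QUERY_CS_CYCLES,
		.size = sizeof(*cycles),
		.data = (uintptr_t)cycles,
	};

	memset(cycles, 0, sizeof(*cycles));	/* keeps .rsvd zeroed, as required */
	cycles->eci.engine_class = DRM_XE_ENGINE_CLASS_RENDER;
	cycles->eci.engine_instance = 0;
	cycles->eci.gt_id = 0;
	cycles->clockid = CLOCK_MONOTONIC;

	return drmIoctl(fd, DRM_IOCTL_XE_DEVICE_QUERY, &query);
}

Calling the ioctl with .size = 0 first also works, purely to discover the
expected struct size, as the handler below shows.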

> 
> Signed-off-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa at intel.com>
> Signed-off-by: Francois Dugast <francois.dugast at intel.com>
> Signed-off-by: Rodrigo Vivi <rodrigo.vivi at intel.com>
> ---
>  drivers/gpu/drm/xe/xe_query.c | 141 ++++++++++++++++++++++++++++++++++
>  include/uapi/drm/xe_drm.h     |  95 ++++++++++++++++++-----
>  2 files changed, 218 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_query.c b/drivers/gpu/drm/xe/xe_query.c
> index cbccd5c3dbc8..be9affd4f181 100644
> --- a/drivers/gpu/drm/xe/xe_query.c
> +++ b/drivers/gpu/drm/xe/xe_query.c
> @@ -6,10 +6,12 @@
>  #include "xe_query.h"
>  
>  #include <linux/nospec.h>
> +#include <linux/sched/clock.h>
>  
>  #include <drm/ttm/ttm_placement.h>
>  #include <drm/xe_drm.h>
>  
> +#include "regs/xe_engine_regs.h"
>  #include "xe_bo.h"
>  #include "xe_device.h"
>  #include "xe_exec_queue.h"
> @@ -17,6 +19,7 @@
>  #include "xe_gt.h"
>  #include "xe_guc_hwconfig.h"
>  #include "xe_macros.h"
> +#include "xe_mmio.h"
>  #include "xe_ttm_vram_mgr.h"
>  
>  static const u16 xe_to_user_engine_class[] = {
> @@ -27,6 +30,14 @@ static const u16 xe_to_user_engine_class[] = {
>  	[XE_ENGINE_CLASS_COMPUTE] = DRM_XE_ENGINE_CLASS_COMPUTE,
>  };
>  
> +static const enum xe_engine_class user_to_xe_engine_class[] = {
> +	[DRM_XE_ENGINE_CLASS_RENDER] = XE_ENGINE_CLASS_RENDER,
> +	[DRM_XE_ENGINE_CLASS_COPY] = XE_ENGINE_CLASS_COPY,
> +	[DRM_XE_ENGINE_CLASS_VIDEO_DECODE] = XE_ENGINE_CLASS_VIDEO_DECODE,
> +	[DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE] = XE_ENGINE_CLASS_VIDEO_ENHANCE,
> +	[DRM_XE_ENGINE_CLASS_COMPUTE] = XE_ENGINE_CLASS_COMPUTE,
> +};
> +
>  static size_t calc_hw_engine_info_size(struct xe_device *xe)
>  {
>  	struct xe_hw_engine *hwe;
> @@ -45,6 +56,135 @@ static size_t calc_hw_engine_info_size(struct xe_device *xe)
>  	return i * sizeof(struct drm_xe_engine_class_instance);
>  }
>  
> +typedef u64 (*__ktime_func_t)(void);
> +static __ktime_func_t __clock_id_to_func(clockid_t clk_id)
> +{
> +	/*
> +	 * Use logic same as the perf subsystem to allow user to select the
> +	 * reference clock id to be used for timestamps.
> +	 */
> +	switch (clk_id) {
> +	case CLOCK_MONOTONIC:
> +		return &ktime_get_ns;
> +	case CLOCK_MONOTONIC_RAW:
> +		return &ktime_get_raw_ns;
> +	case CLOCK_REALTIME:
> +		return &ktime_get_real_ns;
> +	case CLOCK_BOOTTIME:
> +		return &ktime_get_boottime_ns;
> +	case CLOCK_TAI:
> +		return &ktime_get_clocktai_ns;
> +	default:
> +		return NULL;
> +	}
> +}
> +
> +static void
> +__read_timestamps(struct xe_gt *gt,
> +		  struct xe_reg lower_reg,
> +		  struct xe_reg upper_reg,
> +		  u64 *cs_ts,
> +		  u64 *cpu_ts,
> +		  u64 *cpu_delta,
> +		  __ktime_func_t cpu_clock)
> +{
> +	u32 upper, lower, old_upper, loop = 0;
> +
> +	upper = xe_mmio_read32(gt, upper_reg);
> +	do {
> +		*cpu_delta = local_clock();
> +		*cpu_ts = cpu_clock();
> +		lower = xe_mmio_read32(gt, lower_reg);
> +		*cpu_delta = local_clock() - *cpu_delta;
> +		old_upper = upper;
> +		upper = xe_mmio_read32(gt, upper_reg);
> +	} while (upper != old_upper && loop++ < 2);
> +
> +	*cs_ts = (u64)upper << 32 | lower;
> +}
> +
> +static int
> +query_cs_cycles(struct xe_device *xe,
> +		struct drm_xe_device_query *query)
> +{
> +	struct drm_xe_query_cs_cycles __user *query_ptr;
> +	struct drm_xe_engine_class_instance *eci;
> +	struct drm_xe_query_cs_cycles resp;
> +	size_t size = sizeof(resp);
> +	__ktime_func_t cpu_clock;
> +	struct xe_hw_engine *hwe;
> +	struct xe_gt *gt;
> +
> +	if (query->size == 0) {
> +		query->size = size;
> +		return 0;
> +	} else if (XE_IOCTL_DBG(xe, query->size != size)) {
> +		return -EINVAL;
> +	}
> +
> +	query_ptr = u64_to_user_ptr(query->data);
> +	if (copy_from_user(&resp, query_ptr, size))
> +		return -EFAULT;
> +
> +	if (resp.rsvd)
> +		return -EINVAL;
> +
> +	cpu_clock = __clock_id_to_func(resp.clockid);
> +	if (!cpu_clock)
> +		return -EINVAL;
> +
> +	eci = &resp.eci;
> +	if (eci->gt_id > XE_MAX_GT_PER_TILE)
> +		return -EINVAL;
> +
> +	gt = xe_device_get_gt(xe, eci->gt_id);
> +	if (!gt)
> +		return -EINVAL;
> +
> +	if (eci->engine_class >= ARRAY_SIZE(user_to_xe_engine_class))
> +		return -EINVAL;
> +
> +	hwe = xe_gt_hw_engine(gt, user_to_xe_engine_class[eci->engine_class],
> +			      eci->engine_instance, true);
> +	if (!hwe)
> +		return -EINVAL;
> +
> +	resp.cs_frequency = gt->info.clock_freq;
> +
> +	xe_device_mem_access_get(xe);
> +	xe_force_wake_get(gt_to_fw(gt), XE_FORCEWAKE_ALL);
> +
> +	__read_timestamps(gt,
> +			  RING_TIMESTAMP(hwe->mmio_base),
> +			  RING_TIMESTAMP_UDW(hwe->mmio_base),
> +			  &resp.cs_cycles,
> +			  &resp.cpu_timestamp,
> +			  &resp.cpu_delta,
> +			  cpu_clock);
> +
> +	xe_force_wake_put(gt_to_fw(gt), XE_FORCEWAKE_ALL);
> +	xe_device_mem_access_put(xe);
> +	resp.width = 36;
> +
> +	/* Only write to the output fields of user query */
> +	if (put_user(resp.cs_frequency, &query_ptr->cs_frequency))
> +		return -EFAULT;
> +
> +	if (put_user(resp.cpu_timestamp, &query_ptr->cpu_timestamp))
> +		return -EFAULT;
> +
> +	if (put_user(resp.cpu_delta, &query_ptr->cpu_delta))
> +		return -EFAULT;
> +
> +	if (put_user(resp.cs_cycles, &query_ptr->cs_cycles))
> +		return -EFAULT;
> +
> +	if (put_user(resp.width, &query_ptr->width))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
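
One usage note for profilers consuming this (my reading of __read_timestamps
above, so treat it as a sketch): cpu_timestamp is sampled just before the
lower dword of the cycle counter is read, and cpu_delta spans that read, so
the GPU sample lands within roughly cpu_delta ns of cpu_timestamp:

	/* Interpretation sketch, with 'cycles' filled in by the query above. */
	uint64_t cpu_ns_of_sample = cycles.cpu_timestamp + cycles.cpu_delta / 2; /* midpoint estimate */
	uint64_t max_error_ns     = cycles.cpu_delta;                            /* upper bound on the skew */
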
>  static int query_engines(struct xe_device *xe,
>  			 struct drm_xe_device_query *query)
>  {
> @@ -369,6 +509,7 @@ static int (* const xe_query_funcs[])(struct xe_device *xe,
>  	query_gts,
>  	query_hwconfig,
>  	query_gt_topology,
> +	query_cs_cycles,
>  };
>  
>  int xe_query_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
> diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
> index d48d8e3c898c..fde1378a60b9 100644
> --- a/include/uapi/drm/xe_drm.h
> +++ b/include/uapi/drm/xe_drm.h
> @@ -128,6 +128,24 @@ struct xe_user_extension {
>  #define DRM_IOCTL_XE_WAIT_USER_FENCE		DRM_IOWR(DRM_COMMAND_BASE + DRM_XE_WAIT_USER_FENCE, struct drm_xe_wait_user_fence)
>  #define DRM_IOCTL_XE_VM_MADVISE			 DRM_IOW(DRM_COMMAND_BASE + DRM_XE_VM_MADVISE, struct drm_xe_vm_madvise)
>  
> +/** struct drm_xe_engine_class_instance - instance of an engine class */
> +struct drm_xe_engine_class_instance {
> +#define DRM_XE_ENGINE_CLASS_RENDER		0
> +#define DRM_XE_ENGINE_CLASS_COPY		1
> +#define DRM_XE_ENGINE_CLASS_VIDEO_DECODE	2
> +#define DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE	3
> +#define DRM_XE_ENGINE_CLASS_COMPUTE		4
> +	/*
> +	 * Kernel only class (not actual hardware engine class). Used for
> +	 * creating ordered queues of VM bind operations.
> +	 */
> +#define DRM_XE_ENGINE_CLASS_VM_BIND		5
> +	__u16 engine_class;
> +
> +	__u16 engine_instance;
> +	__u16 gt_id;
> +};
> +
>  /**
>   * enum drm_xe_memory_class - Supported memory classes.
>   */
> @@ -219,6 +237,64 @@ struct drm_xe_query_mem_region {
>  	__u64 reserved[6];
>  };
>  
> +/**
> + * struct drm_xe_query_cs_cycles - correlate CPU and GPU timestamps
> + *
> + * If a query is made with a struct drm_xe_device_query where .query
> + * is equal to DRM_XE_QUERY_CS_CYCLES, then the reply uses
> + * struct drm_xe_query_cs_cycles in .data.
> + *
> + * struct drm_xe_query_cs_cycles is allocated by the user and .data points to
> + * this allocated structure. The user must pass .eci and .clockid as inputs to
> + * this query. eci determines the engine and tile info required to fetch the
> + * relevant GPU timestamp. clockid is used to return the specific CPU
> + * timestamp.
> + *
> + * The query returns the command streamer cycles and the frequency that can
> + * be used to calculate the command streamer timestamp. In addition the
> + * query returns a set of cpu timestamps that indicate when the command
> + * streamer cycle count was captured.
> + */
> +struct drm_xe_query_cs_cycles {
> +	/** Engine for which command streamer cycles is queried. */
> +	struct drm_xe_engine_class_instance eci;
> +
> +	/** MBZ (pad eci to 64 bit) */
> +	__u16 rsvd;
> +
> +	/**
> +	 * Command streamer cycles as read from the command streamer
> +	 * register at 0x358 offset.
> +	 */
> +	__u64 cs_cycles;
> +
> +	/** Frequency of the cs cycles in Hz. */
> +	__u64 cs_frequency;
> +
> +	/**
> +	 * CPU timestamp in ns. The timestamp is captured before reading the
> +	 * cs_cycles register using the reference clockid set by the user.
> +	 */
> +	__u64 cpu_timestamp;
> +
> +	/**
> +	 * Time delta in ns captured around reading the lower dword of the
> +	 * cs_cycles register.
> +	 */
> +	__u64 cpu_delta;
> +
> +	/**
> +	 * Reference clock id for CPU timestamp. For definition, see
> +	 * clock_gettime(2) and perf_event_open(2). Supported clock ids are
> +	 * CLOCK_MONOTONIC, CLOCK_MONOTONIC_RAW, CLOCK_REALTIME, CLOCK_BOOTTIME,
> +	 * CLOCK_TAI.
> +	 */
> +	__s32 clockid;
> +
> +	/** Width of the cs cycle counter in bits. */
> +	__u32 width;
> +};
> +
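
A small, hypothetical userspace helper for these fields (not part of the
patch; assumes <stdint.h> and a compiler with unsigned __int128), just to
show how width and cs_frequency are meant to be combined:

	/* Convert cs_cycles to nanoseconds. The counter wraps at 2^width
	 * cycles, so the difference between two samples should be masked
	 * the same way before converting. */
	static uint64_t cs_cycles_to_ns(const struct drm_xe_query_cs_cycles *c)
	{
		uint64_t mask = c->width < 64 ? (1ull << c->width) - 1 : ~0ull;
		uint64_t cycles = c->cs_cycles & mask;

		return (uint64_t)((unsigned __int128)cycles * 1000000000ull /
				  c->cs_frequency);
	}
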
>  /**
>   * struct drm_xe_query_mem_usage - describe memory regions and usage
>   *
> @@ -391,6 +467,7 @@ struct drm_xe_device_query {
>  #define DRM_XE_DEVICE_QUERY_GTS		3
>  #define DRM_XE_DEVICE_QUERY_HWCONFIG	4
>  #define DRM_XE_DEVICE_QUERY_GT_TOPOLOGY	5
> +#define DRM_XE_QUERY_CS_CYCLES		6
>  	/** @query: The type of data to query */
>  	__u32 query;
>  
> @@ -732,24 +809,6 @@ struct drm_xe_exec_queue_set_property {
>  	__u64 reserved[2];
>  };
>  
> -/** struct drm_xe_engine_class_instance - instance of an engine class */
> -struct drm_xe_engine_class_instance {
> -#define DRM_XE_ENGINE_CLASS_RENDER		0
> -#define DRM_XE_ENGINE_CLASS_COPY		1
> -#define DRM_XE_ENGINE_CLASS_VIDEO_DECODE	2
> -#define DRM_XE_ENGINE_CLASS_VIDEO_ENHANCE	3
> -#define DRM_XE_ENGINE_CLASS_COMPUTE		4
> -	/*
> -	 * Kernel only class (not actual hardware engine class). Used for
> -	 * creating ordered queues of VM bind operations.
> -	 */
> -#define DRM_XE_ENGINE_CLASS_VM_BIND		5
> -	__u16 engine_class;
> -
> -	__u16 engine_instance;
> -	__u16 gt_id;
> -};
> -
>  struct drm_xe_exec_queue_create {
>  #define XE_EXEC_QUEUE_EXTENSION_SET_PROPERTY               0
>  	/** @extensions: Pointer to the first extension struct, if any */


