[PATCH] drm/v3d: Expose performance counters to userspace

Melissa Wen melissa.srw at gmail.com
Mon Jun 14 16:19:25 UTC 2021


On 06/08, Juan A. Suarez Romero wrote:
> The V3D engine has several hardware performance counters that can be of
> interest for userspace performance analysis tools.
> 
> This exposes new ioctls to create and destroy performance monitor
> objects, as well as to query the counter values.
> 
> Each created performance monitor object has an ID that can be attached
> to CL/CSD submissions, so the driver enables the requested counters when
> the job is submitted, and updates the performance monitor values when
> the job is done.
> 
> It is up to the user to ensure all the jobs have been finished before
> getting the performance monitor values. It is also up to the user to
> properly synchronize BCL jobs when submitting jobs with different
> performance monitors attached.
> 
> Cc: Daniel Vetter <daniel at ffwll.ch>
> Cc: David Airlie <airlied at linux.ie>
> Cc: Emma Anholt <emma at anholt.net>
> To: dri-devel at lists.freedesktop.org
> Signed-off-by: Juan A. Suarez Romero <jasuarez at igalia.com>

Hi Juan,

I've checked it (+ mesa MR) on glxgears, and lgtm.

+ some basic tests from igt, by default.

Acked-by: Melissa Wen <mwen at igalia.com>

> ---
>  drivers/gpu/drm/v3d/Makefile      |   1 +
>  drivers/gpu/drm/v3d/v3d_drv.c     |   8 ++
>  drivers/gpu/drm/v3d/v3d_drv.h     |  63 +++++++++
>  drivers/gpu/drm/v3d/v3d_gem.c     |  31 +++++
>  drivers/gpu/drm/v3d/v3d_perfmon.c | 213 ++++++++++++++++++++++++++++++
>  drivers/gpu/drm/v3d/v3d_regs.h    |   2 +
>  drivers/gpu/drm/v3d/v3d_sched.c   |  16 +++
>  include/uapi/drm/v3d_drm.h        | 136 +++++++++++++++++++
>  8 files changed, 470 insertions(+)
>  create mode 100644 drivers/gpu/drm/v3d/v3d_perfmon.c
> 
> diff --git a/drivers/gpu/drm/v3d/Makefile b/drivers/gpu/drm/v3d/Makefile
> index db4cfc155821..e8b314137020 100644
> --- a/drivers/gpu/drm/v3d/Makefile
> +++ b/drivers/gpu/drm/v3d/Makefile
> @@ -9,6 +9,7 @@ v3d-y := \
>  	v3d_gem.o \
>  	v3d_irq.o \
>  	v3d_mmu.o \
> +	v3d_perfmon.o \
>  	v3d_trace_points.o \
>  	v3d_sched.o
>  
> diff --git a/drivers/gpu/drm/v3d/v3d_drv.c b/drivers/gpu/drm/v3d/v3d_drv.c
> index 99e22beea90b..9403c3b36aca 100644
> --- a/drivers/gpu/drm/v3d/v3d_drv.c
> +++ b/drivers/gpu/drm/v3d/v3d_drv.c
> @@ -94,6 +94,9 @@ static int v3d_get_param_ioctl(struct drm_device *dev, void *data,
>  	case DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH:
>  		args->value = 1;
>  		return 0;
> +	case DRM_V3D_PARAM_SUPPORTS_PERFMON:
> +		args->value = (v3d->ver >= 40);
> +		return 0;
>  	default:
>  		DRM_DEBUG("Unknown parameter %d\n", args->param);
>  		return -EINVAL;
> @@ -121,6 +124,7 @@ v3d_open(struct drm_device *dev, struct drm_file *file)
>  				      1, NULL);
>  	}
>  
> +	v3d_perfmon_open_file(v3d_priv);
>  	file->driver_priv = v3d_priv;
>  
>  	return 0;
> @@ -136,6 +140,7 @@ v3d_postclose(struct drm_device *dev, struct drm_file *file)
>  		drm_sched_entity_destroy(&v3d_priv->sched_entity[q]);
>  	}
>  
> +	v3d_perfmon_close_file(v3d_priv);
>  	kfree(v3d_priv);
>  }
>  
> @@ -156,6 +161,9 @@ static const struct drm_ioctl_desc v3d_drm_ioctls[] = {
>  	DRM_IOCTL_DEF_DRV(V3D_GET_BO_OFFSET, v3d_get_bo_offset_ioctl, DRM_RENDER_ALLOW),
>  	DRM_IOCTL_DEF_DRV(V3D_SUBMIT_TFU, v3d_submit_tfu_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
>  	DRM_IOCTL_DEF_DRV(V3D_SUBMIT_CSD, v3d_submit_csd_ioctl, DRM_RENDER_ALLOW | DRM_AUTH),
> +	DRM_IOCTL_DEF_DRV(V3D_PERFMON_CREATE, v3d_perfmon_create_ioctl, DRM_RENDER_ALLOW),
> +	DRM_IOCTL_DEF_DRV(V3D_PERFMON_DESTROY, v3d_perfmon_destroy_ioctl, DRM_RENDER_ALLOW),
> +	DRM_IOCTL_DEF_DRV(V3D_PERFMON_GET_VALUES, v3d_perfmon_get_values_ioctl, DRM_RENDER_ALLOW),
>  };
>  
>  static const struct drm_driver v3d_drm_driver = {
> diff --git a/drivers/gpu/drm/v3d/v3d_drv.h b/drivers/gpu/drm/v3d/v3d_drv.h
> index 8a390738d65b..270134779073 100644
> --- a/drivers/gpu/drm/v3d/v3d_drv.h
> +++ b/drivers/gpu/drm/v3d/v3d_drv.h
> @@ -37,6 +37,40 @@ struct v3d_queue_state {
>  	u64 emit_seqno;
>  };
>  
> +/* Performance monitor object. The perfmon lifetime is controlled by userspace
> + * using perfmon related ioctls. A perfmon can be attached to a submit_cl
> + * request, and when this is the case, HW perf counters will be activated just
> + * before the submit_cl is submitted to the GPU and disabled when the job is
> + * done. This way, only events related to a specific job will be counted.
> + */
> +struct v3d_perfmon {
> +	/* Tracks the number of users of the perfmon, when this counter reaches
> +	 * zero the perfmon is destroyed.
> +	 */
> +	refcount_t refcnt;
> +
> +	/* Protects perfmon stop, as it can be invoked from multiple places. */
> +	struct mutex lock;
> +
> +	/* Number of counters activated in this perfmon instance
> +	 * (must not exceed DRM_V3D_MAX_PERF_COUNTERS).
> +	 */
> +	u8 ncounters;
> +
> +	/* Events counted by the HW perf counters. */
> +	u8 counters[DRM_V3D_MAX_PERF_COUNTERS];
> +
> +	/* Storage for counter values. Counters are incremented by the
> +	 * HW perf counter values every time the perfmon is attached
> +	 * to a GPU job.  This way, perfmon users don't have to
> +	 * retrieve the results after each job if they want to track
> +	 * events covering several submissions.  Note that counter
> +	 * values can't be reset, but you can fake a reset by
> +	 * destroying the perfmon and creating a new one.
> +	 */
> +	u64 values[];
> +};
> +
>  struct v3d_dev {
>  	struct drm_device drm;
>  
> @@ -89,6 +123,9 @@ struct v3d_dev {
>  	 */
>  	spinlock_t job_lock;
>  
> +	/* Used to track the active perfmon if any. */
> +	struct v3d_perfmon *active_perfmon;
> +
>  	/* Protects bo_stats */
>  	struct mutex bo_lock;
>  
> @@ -133,6 +170,11 @@ v3d_has_csd(struct v3d_dev *v3d)
>  struct v3d_file_priv {
>  	struct v3d_dev *v3d;
>  
> +	struct {
> +		struct idr idr;
> +		struct mutex lock;
> +	} perfmon;
> +
>  	struct drm_sched_entity sched_entity[V3D_MAX_QUEUES];
>  };
>  
> @@ -205,6 +247,11 @@ struct v3d_job {
>  	 */
>  	struct dma_fence *done_fence;
>  
> +	/* Pointer to a performance monitor object if the user requested it,
> +	 * NULL otherwise.
> +	 */
> +	struct v3d_perfmon *perfmon;
> +
>  	/* Callback for the freeing of the job on refcount going to 0. */
>  	void (*free)(struct kref *ref);
>  };
> @@ -353,3 +400,19 @@ void v3d_mmu_remove_ptes(struct v3d_bo *bo);
>  /* v3d_sched.c */
>  int v3d_sched_init(struct v3d_dev *v3d);
>  void v3d_sched_fini(struct v3d_dev *v3d);
> +
> +/* v3d_perfmon.c */
> +void v3d_perfmon_get(struct v3d_perfmon *perfmon);
> +void v3d_perfmon_put(struct v3d_perfmon *perfmon);
> +void v3d_perfmon_start(struct v3d_dev *v3d, struct v3d_perfmon *perfmon);
> +void v3d_perfmon_stop(struct v3d_dev *v3d, struct v3d_perfmon *perfmon,
> +		      bool capture);
> +struct v3d_perfmon *v3d_perfmon_find(struct v3d_file_priv *v3d_priv, int id);
> +void v3d_perfmon_open_file(struct v3d_file_priv *v3d_priv);
> +void v3d_perfmon_close_file(struct v3d_file_priv *v3d_priv);
> +int v3d_perfmon_create_ioctl(struct drm_device *dev, void *data,
> +			     struct drm_file *file_priv);
> +int v3d_perfmon_destroy_ioctl(struct drm_device *dev, void *data,
> +			      struct drm_file *file_priv);
> +int v3d_perfmon_get_values_ioctl(struct drm_device *dev, void *data,
> +				 struct drm_file *file_priv);
> diff --git a/drivers/gpu/drm/v3d/v3d_gem.c b/drivers/gpu/drm/v3d/v3d_gem.c
> index 4eb354226972..5689da118197 100644
> --- a/drivers/gpu/drm/v3d/v3d_gem.c
> +++ b/drivers/gpu/drm/v3d/v3d_gem.c
> @@ -126,6 +126,8 @@ v3d_reset(struct v3d_dev *v3d)
>  	v3d_mmu_set_page_table(v3d);
>  	v3d_irq_reset(v3d);
>  
> +	v3d_perfmon_stop(v3d, v3d->active_perfmon, false);
> +
>  	trace_v3d_reset_end(dev);
>  }
>  
> @@ -375,6 +377,9 @@ v3d_job_free(struct kref *ref)
>  	pm_runtime_mark_last_busy(job->v3d->drm.dev);
>  	pm_runtime_put_autosuspend(job->v3d->drm.dev);
>  
> +	if (job->perfmon)
> +		v3d_perfmon_put(job->perfmon);
> +
>  	kfree(job);
>  }
>  
> @@ -539,6 +544,9 @@ v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
>  
>  	trace_v3d_submit_cl_ioctl(&v3d->drm, args->rcl_start, args->rcl_end);
>  
> +	if (args->pad != 0)
> +		return -EINVAL;
> +
>  	if (args->flags != 0 &&
>  	    args->flags != DRM_V3D_SUBMIT_CL_FLUSH_CACHE) {
>  		DRM_INFO("invalid flags: %d\n", args->flags);
> @@ -611,8 +619,20 @@ v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
>  	if (ret)
>  		goto fail;
>  
> +	if (args->perfmon_id) {
> +		render->base.perfmon = v3d_perfmon_find(v3d_priv,
> +							args->perfmon_id);
> +
> +		if (!render->base.perfmon) {
> +			ret = -ENOENT;
> +			goto fail;
> +		}
> +	}
> +
>  	mutex_lock(&v3d->sched_lock);
>  	if (bin) {
> +		bin->base.perfmon = render->base.perfmon;
> +		v3d_perfmon_get(bin->base.perfmon);
>  		ret = v3d_push_job(v3d_priv, &bin->base, V3D_BIN);
>  		if (ret)
>  			goto fail_unreserve;
> @@ -633,6 +653,8 @@ v3d_submit_cl_ioctl(struct drm_device *dev, void *data,
>  		ret = drm_gem_fence_array_add(&clean_job->deps, render_fence);
>  		if (ret)
>  			goto fail_unreserve;
> +		clean_job->perfmon = render->base.perfmon;
> +		v3d_perfmon_get(clean_job->perfmon);
>  		ret = v3d_push_job(v3d_priv, clean_job, V3D_CACHE_CLEAN);
>  		if (ret)
>  			goto fail_unreserve;
> @@ -827,6 +849,15 @@ v3d_submit_csd_ioctl(struct drm_device *dev, void *data,
>  	if (ret)
>  		goto fail;
>  
> +	if (args->perfmon_id) {
> +		job->base.perfmon = v3d_perfmon_find(v3d_priv,
> +						     args->perfmon_id);
> +		if (!job->base.perfmon) {
> +			ret = -ENOENT;
> +			goto fail;
> +		}
> +	}
> +
>  	mutex_lock(&v3d->sched_lock);
>  	ret = v3d_push_job(v3d_priv, &job->base, V3D_CSD);
>  	if (ret)
> diff --git a/drivers/gpu/drm/v3d/v3d_perfmon.c b/drivers/gpu/drm/v3d/v3d_perfmon.c
> new file mode 100644
> index 000000000000..0288ef063513
> --- /dev/null
> +++ b/drivers/gpu/drm/v3d/v3d_perfmon.c
> @@ -0,0 +1,213 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2021 Raspberry Pi
> + */
> +
> +#include "v3d_drv.h"
> +#include "v3d_regs.h"
> +
> +#define V3D_PERFMONID_MIN	1
> +#define V3D_PERFMONID_MAX	U32_MAX
> +
> +void v3d_perfmon_get(struct v3d_perfmon *perfmon)
> +{
> +	if (perfmon)
> +		refcount_inc(&perfmon->refcnt);
> +}
> +
> +void v3d_perfmon_put(struct v3d_perfmon *perfmon)
> +{
> +	if (perfmon && refcount_dec_and_test(&perfmon->refcnt))
> +		kfree(perfmon);
> +}
> +
> +void v3d_perfmon_start(struct v3d_dev *v3d, struct v3d_perfmon *perfmon)
> +{
> +	unsigned int i;
> +	u32 mask;
> +	u8 ncounters = perfmon->ncounters;
> +
> +	if (WARN_ON_ONCE(!perfmon || v3d->active_perfmon))
> +		return;
> +
> +	mask = GENMASK(ncounters - 1, 0);
> +
> +	for (i = 0; i < ncounters; i++) {
> +		u32 source = i / 4;
> +		u32 channel = V3D_SET_FIELD(perfmon->counters[i], V3D_PCTR_S0);
> +
> +		i++;
> +		channel |= V3D_SET_FIELD(i < ncounters ? perfmon->counters[i] : 0,
> +					 V3D_PCTR_S1);
> +		i++;
> +		channel |= V3D_SET_FIELD(i < ncounters ? perfmon->counters[i] : 0,
> +					 V3D_PCTR_S2);
> +		i++;
> +		channel |= V3D_SET_FIELD(i < ncounters ? perfmon->counters[i] : 0,
> +					 V3D_PCTR_S3);
> +		V3D_CORE_WRITE(0, V3D_V4_PCTR_0_SRC_X(source), channel);
> +	}
> +
> +	V3D_CORE_WRITE(0, V3D_V4_PCTR_0_CLR, mask);
> +	V3D_CORE_WRITE(0, V3D_PCTR_0_OVERFLOW, mask);
> +	V3D_CORE_WRITE(0, V3D_V4_PCTR_0_EN, mask);
> +
> +	v3d->active_perfmon = perfmon;
> +}
> +
> +void v3d_perfmon_stop(struct v3d_dev *v3d, struct v3d_perfmon *perfmon,
> +		      bool capture)
> +{
> +	unsigned int i;
> +
> +	if (!perfmon || !v3d->active_perfmon)
> +		return;
> +
> +	mutex_lock(&perfmon->lock);
> +	if (perfmon != v3d->active_perfmon) {
> +		mutex_unlock(&perfmon->lock);
> +		return;
> +	}
> +
> +	if (capture)
> +		for (i = 0; i < perfmon->ncounters; i++)
> +			perfmon->values[i] += V3D_CORE_READ(0, V3D_PCTR_0_PCTRX(i));
> +
> +	V3D_CORE_WRITE(0, V3D_V4_PCTR_0_EN, 0);
> +
> +	v3d->active_perfmon = NULL;
> +	mutex_unlock(&perfmon->lock);
> +}
> +
> +struct v3d_perfmon *v3d_perfmon_find(struct v3d_file_priv *v3d_priv, int id)
> +{
> +	struct v3d_perfmon *perfmon;
> +
> +	mutex_lock(&v3d_priv->perfmon.lock);
> +	perfmon = idr_find(&v3d_priv->perfmon.idr, id);
> +	v3d_perfmon_get(perfmon);
> +	mutex_unlock(&v3d_priv->perfmon.lock);
> +
> +	return perfmon;
> +}
> +
> +void v3d_perfmon_open_file(struct v3d_file_priv *v3d_priv)
> +{
> +	mutex_init(&v3d_priv->perfmon.lock);
> +	idr_init(&v3d_priv->perfmon.idr);
> +}
> +
> +static int v3d_perfmon_idr_del(int id, void *elem, void *data)
> +{
> +	struct v3d_perfmon *perfmon = elem;
> +
> +	v3d_perfmon_put(perfmon);
> +
> +	return 0;
> +}
> +
> +void v3d_perfmon_close_file(struct v3d_file_priv *v3d_priv)
> +{
> +	mutex_lock(&v3d_priv->perfmon.lock);
> +	idr_for_each(&v3d_priv->perfmon.idr, v3d_perfmon_idr_del, NULL);
> +	idr_destroy(&v3d_priv->perfmon.idr);
> +	mutex_unlock(&v3d_priv->perfmon.lock);
> +}
> +
> +int v3d_perfmon_create_ioctl(struct drm_device *dev, void *data,
> +			     struct drm_file *file_priv)
> +{
> +	struct v3d_file_priv *v3d_priv = file_priv->driver_priv;
> +	struct drm_v3d_perfmon_create *req = data;
> +	struct v3d_perfmon *perfmon;
> +	unsigned int i;
> +	int ret;
> +
> +	/* Number of monitored counters cannot exceed HW limits. */
> +	if (req->ncounters > DRM_V3D_MAX_PERF_COUNTERS ||
> +	    !req->ncounters)
> +		return -EINVAL;
> +
> +	/* Make sure all counters are valid. */
> +	for (i = 0; i < req->ncounters; i++) {
> +		if (req->counters[i] >= V3D_PERFCNT_NUM)
> +			return -EINVAL;
> +	}
> +
> +	perfmon = kzalloc(struct_size(perfmon, values, req->ncounters),
> +			  GFP_KERNEL);
> +	if (!perfmon)
> +		return -ENOMEM;
> +
> +	for (i = 0; i < req->ncounters; i++)
> +		perfmon->counters[i] = req->counters[i];
> +
> +	perfmon->ncounters = req->ncounters;
> +
> +	refcount_set(&perfmon->refcnt, 1);
> +	mutex_init(&perfmon->lock);
> +
> +	mutex_lock(&v3d_priv->perfmon.lock);
> +	ret = idr_alloc(&v3d_priv->perfmon.idr, perfmon, V3D_PERFMONID_MIN,
> +			V3D_PERFMONID_MAX, GFP_KERNEL);
> +	mutex_unlock(&v3d_priv->perfmon.lock);
> +
> +	if (ret < 0) {
> +		kfree(perfmon);
> +		return ret;
> +	}
> +
> +	req->id = ret;
> +
> +	return 0;
> +}
> +
> +int v3d_perfmon_destroy_ioctl(struct drm_device *dev, void *data,
> +			      struct drm_file *file_priv)
> +{
> +	struct v3d_file_priv *v3d_priv = file_priv->driver_priv;
> +	struct drm_v3d_perfmon_destroy *req = data;
> +	struct v3d_perfmon *perfmon;
> +
> +	mutex_lock(&v3d_priv->perfmon.lock);
> +	perfmon = idr_remove(&v3d_priv->perfmon.idr, req->id);
> +	mutex_unlock(&v3d_priv->perfmon.lock);
> +
> +	if (!perfmon)
> +		return -EINVAL;
> +
> +	v3d_perfmon_put(perfmon);
> +
> +	return 0;
> +}
> +
> +int v3d_perfmon_get_values_ioctl(struct drm_device *dev, void *data,
> +				 struct drm_file *file_priv)
> +{
> +	struct v3d_dev *v3d = to_v3d_dev(dev);
> +	struct v3d_file_priv *v3d_priv = file_priv->driver_priv;
> +	struct drm_v3d_perfmon_get_values *req = data;
> +	struct v3d_perfmon *perfmon;
> +	int ret = 0;
> +
> +	if (req->pad != 0)
> +		return -EINVAL;
> +
> +	mutex_lock(&v3d_priv->perfmon.lock);
> +	perfmon = idr_find(&v3d_priv->perfmon.idr, req->id);
> +	v3d_perfmon_get(perfmon);
> +	mutex_unlock(&v3d_priv->perfmon.lock);
> +
> +	if (!perfmon)
> +		return -EINVAL;
> +
> +	v3d_perfmon_stop(v3d, perfmon, true);
> +
> +	if (copy_to_user(u64_to_user_ptr(req->values_ptr), perfmon->values,
> +			 perfmon->ncounters * sizeof(u64)))
> +		ret = -EFAULT;
> +
> +	v3d_perfmon_put(perfmon);
> +
> +	return ret;
> +}
> diff --git a/drivers/gpu/drm/v3d/v3d_regs.h b/drivers/gpu/drm/v3d/v3d_regs.h
> index 9bcb57781d31..3663e0d6bf76 100644
> --- a/drivers/gpu/drm/v3d/v3d_regs.h
> +++ b/drivers/gpu/drm/v3d/v3d_regs.h
> @@ -347,6 +347,8 @@
>  /* Each src reg muxes four counters each. */
>  #define V3D_V4_PCTR_0_SRC_0_3                          0x00660
>  #define V3D_V4_PCTR_0_SRC_28_31                        0x0067c
> +#define V3D_V4_PCTR_0_SRC_X(x)                         (V3D_V4_PCTR_0_SRC_0_3 + \
> +							4 * (x))
>  # define V3D_PCTR_S0_MASK                              V3D_MASK(6, 0)
>  # define V3D_PCTR_S0_SHIFT                             0
>  # define V3D_PCTR_S1_MASK                              V3D_MASK(14, 8)
> diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c
> index 8992480c88fa..c9a5c916d6eb 100644
> --- a/drivers/gpu/drm/v3d/v3d_sched.c
> +++ b/drivers/gpu/drm/v3d/v3d_sched.c
> @@ -63,6 +63,16 @@ v3d_job_free(struct drm_sched_job *sched_job)
>  	v3d_job_put(job);
>  }
>  
> +static void
> +v3d_switch_perfmon(struct v3d_dev *v3d, struct v3d_job *job)
> +{
> +	if (job->perfmon != v3d->active_perfmon)
> +		v3d_perfmon_stop(v3d, v3d->active_perfmon, true);
> +
> +	if (job->perfmon && v3d->active_perfmon != job->perfmon)
> +		v3d_perfmon_start(v3d, job->perfmon);
> +}
> +
>  /*
>   * Returns the fences that the job depends on, one by one.
>   *
> @@ -120,6 +130,8 @@ static struct dma_fence *v3d_bin_job_run(struct drm_sched_job *sched_job)
>  	trace_v3d_submit_cl(dev, false, to_v3d_fence(fence)->seqno,
>  			    job->start, job->end);
>  
> +	v3d_switch_perfmon(v3d, &job->base);
> +
>  	/* Set the current and end address of the control list.
>  	 * Writing the end register is what starts the job.
>  	 */
> @@ -169,6 +181,8 @@ static struct dma_fence *v3d_render_job_run(struct drm_sched_job *sched_job)
>  	trace_v3d_submit_cl(dev, true, to_v3d_fence(fence)->seqno,
>  			    job->start, job->end);
>  
> +	v3d_switch_perfmon(v3d, &job->base);
> +
>  	/* XXX: Set the QCFG */
>  
>  	/* Set the current and end address of the control list.
> @@ -240,6 +254,8 @@ v3d_csd_job_run(struct drm_sched_job *sched_job)
>  
>  	trace_v3d_submit_csd(dev, to_v3d_fence(fence)->seqno);
>  
> +	v3d_switch_perfmon(v3d, &job->base);
> +
>  	for (i = 1; i <= 6; i++)
>  		V3D_CORE_WRITE(0, V3D_CSD_QUEUED_CFG0 + 4 * i, job->args.cfg[i]);
>  	/* CFG0 write kicks off the job. */
> diff --git a/include/uapi/drm/v3d_drm.h b/include/uapi/drm/v3d_drm.h
> index 1ce746e228d9..4104f22fb3d3 100644
> --- a/include/uapi/drm/v3d_drm.h
> +++ b/include/uapi/drm/v3d_drm.h
> @@ -38,6 +38,9 @@ extern "C" {
>  #define DRM_V3D_GET_BO_OFFSET                     0x05
>  #define DRM_V3D_SUBMIT_TFU                        0x06
>  #define DRM_V3D_SUBMIT_CSD                        0x07
> +#define DRM_V3D_PERFMON_CREATE                    0x08
> +#define DRM_V3D_PERFMON_DESTROY                   0x09
> +#define DRM_V3D_PERFMON_GET_VALUES                0x0a
>  
>  #define DRM_IOCTL_V3D_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CL, struct drm_v3d_submit_cl)
>  #define DRM_IOCTL_V3D_WAIT_BO             DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_WAIT_BO, struct drm_v3d_wait_bo)
> @@ -47,6 +50,12 @@ extern "C" {
>  #define DRM_IOCTL_V3D_GET_BO_OFFSET       DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_GET_BO_OFFSET, struct drm_v3d_get_bo_offset)
>  #define DRM_IOCTL_V3D_SUBMIT_TFU          DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_TFU, struct drm_v3d_submit_tfu)
>  #define DRM_IOCTL_V3D_SUBMIT_CSD          DRM_IOW(DRM_COMMAND_BASE + DRM_V3D_SUBMIT_CSD, struct drm_v3d_submit_csd)
> +#define DRM_IOCTL_V3D_PERFMON_CREATE      DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_PERFMON_CREATE, \
> +						   struct drm_v3d_perfmon_create)
> +#define DRM_IOCTL_V3D_PERFMON_DESTROY     DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_PERFMON_DESTROY, \
> +						   struct drm_v3d_perfmon_destroy)
> +#define DRM_IOCTL_V3D_PERFMON_GET_VALUES  DRM_IOWR(DRM_COMMAND_BASE + DRM_V3D_PERFMON_GET_VALUES, \
> +						   struct drm_v3d_perfmon_get_values)
>  
>  #define DRM_V3D_SUBMIT_CL_FLUSH_CACHE             0x01
>  
> @@ -127,6 +136,11 @@ struct drm_v3d_submit_cl {
>  	__u32 bo_handle_count;
>  
>  	__u32 flags;
> +
> +	/* ID of the perfmon to attach to this job. 0 means no perfmon. */
> +	__u32 perfmon_id;
> +
> +	__u32 pad;
>  };
>  
>  /**
> @@ -195,6 +209,7 @@ enum drm_v3d_param {
>  	DRM_V3D_PARAM_SUPPORTS_TFU,
>  	DRM_V3D_PARAM_SUPPORTS_CSD,
>  	DRM_V3D_PARAM_SUPPORTS_CACHE_FLUSH,
> +	DRM_V3D_PARAM_SUPPORTS_PERFMON,
>  };
>  
>  struct drm_v3d_get_param {
> @@ -258,6 +273,127 @@ struct drm_v3d_submit_csd {
>  	__u32 in_sync;
>  	/* Sync object to signal when the CSD job is done. */
>  	__u32 out_sync;
> +
> +	/* ID of the perfmon to attach to this job. 0 means no perfmon. */
> +	__u32 perfmon_id;
> +};
> +
> +enum {
> +	V3D_PERFCNT_FEP_VALID_PRIMTS_NO_PIXELS,
> +	V3D_PERFCNT_FEP_VALID_PRIMS,
> +	V3D_PERFCNT_FEP_EZ_NFCLIP_QUADS,
> +	V3D_PERFCNT_FEP_VALID_QUADS,
> +	V3D_PERFCNT_TLB_QUADS_STENCIL_FAIL,
> +	V3D_PERFCNT_TLB_QUADS_STENCILZ_FAIL,
> +	V3D_PERFCNT_TLB_QUADS_STENCILZ_PASS,
> +	V3D_PERFCNT_TLB_QUADS_ZERO_COV,
> +	V3D_PERFCNT_TLB_QUADS_NONZERO_COV,
> +	V3D_PERFCNT_TLB_QUADS_WRITTEN,
> +	V3D_PERFCNT_PTB_PRIM_VIEWPOINT_DISCARD,
> +	V3D_PERFCNT_PTB_PRIM_CLIP,
> +	V3D_PERFCNT_PTB_PRIM_REV,
> +	V3D_PERFCNT_QPU_IDLE_CYCLES,
> +	V3D_PERFCNT_QPU_ACTIVE_CYCLES_VERTEX_COORD_USER,
> +	V3D_PERFCNT_QPU_ACTIVE_CYCLES_FRAG,
> +	V3D_PERFCNT_QPU_CYCLES_VALID_INSTR,
> +	V3D_PERFCNT_QPU_CYCLES_TMU_STALL,
> +	V3D_PERFCNT_QPU_CYCLES_SCOREBOARD_STALL,
> +	V3D_PERFCNT_QPU_CYCLES_VARYINGS_STALL,
> +	V3D_PERFCNT_QPU_IC_HIT,
> +	V3D_PERFCNT_QPU_IC_MISS,
> +	V3D_PERFCNT_QPU_UC_HIT,
> +	V3D_PERFCNT_QPU_UC_MISS,
> +	V3D_PERFCNT_TMU_TCACHE_ACCESS,
> +	V3D_PERFCNT_TMU_TCACHE_MISS,
> +	V3D_PERFCNT_VPM_VDW_STALL,
> +	V3D_PERFCNT_VPM_VCD_STALL,
> +	V3D_PERFCNT_BIN_ACTIVE,
> +	V3D_PERFCNT_RDR_ACTIVE,
> +	V3D_PERFCNT_L2T_HITS,
> +	V3D_PERFCNT_L2T_MISSES,
> +	V3D_PERFCNT_CYCLE_COUNT,
> +	V3D_PERFCNT_QPU_CYCLES_STALLED_VERTEX_COORD_USER,
> +	V3D_PERFCNT_QPU_CYCLES_STALLED_FRAGMENT,
> +	V3D_PERFCNT_PTB_PRIMS_BINNED,
> +	V3D_PERFCNT_AXI_WRITES_WATCH_0,
> +	V3D_PERFCNT_AXI_READS_WATCH_0,
> +	V3D_PERFCNT_AXI_WRITE_STALLS_WATCH_0,
> +	V3D_PERFCNT_AXI_READ_STALLS_WATCH_0,
> +	V3D_PERFCNT_AXI_WRITE_BYTES_WATCH_0,
> +	V3D_PERFCNT_AXI_READ_BYTES_WATCH_0,
> +	V3D_PERFCNT_AXI_WRITES_WATCH_1,
> +	V3D_PERFCNT_AXI_READS_WATCH_1,
> +	V3D_PERFCNT_AXI_WRITE_STALLS_WATCH_1,
> +	V3D_PERFCNT_AXI_READ_STALLS_WATCH_1,
> +	V3D_PERFCNT_AXI_WRITE_BYTES_WATCH_1,
> +	V3D_PERFCNT_AXI_READ_BYTES_WATCH_1,
> +	V3D_PERFCNT_TLB_PARTIAL_QUADS,
> +	V3D_PERFCNT_TMU_CONFIG_ACCESSES,
> +	V3D_PERFCNT_L2T_NO_ID_STALL,
> +	V3D_PERFCNT_L2T_COM_QUE_STALL,
> +	V3D_PERFCNT_L2T_TMU_WRITES,
> +	V3D_PERFCNT_TMU_ACTIVE_CYCLES,
> +	V3D_PERFCNT_TMU_STALLED_CYCLES,
> +	V3D_PERFCNT_CLE_ACTIVE,
> +	V3D_PERFCNT_L2T_TMU_READS,
> +	V3D_PERFCNT_L2T_CLE_READS,
> +	V3D_PERFCNT_L2T_VCD_READS,
> +	V3D_PERFCNT_L2T_TMUCFG_READS,
> +	V3D_PERFCNT_L2T_SLC0_READS,
> +	V3D_PERFCNT_L2T_SLC1_READS,
> +	V3D_PERFCNT_L2T_SLC2_READS,
> +	V3D_PERFCNT_L2T_TMU_W_MISSES,
> +	V3D_PERFCNT_L2T_TMU_R_MISSES,
> +	V3D_PERFCNT_L2T_CLE_MISSES,
> +	V3D_PERFCNT_L2T_VCD_MISSES,
> +	V3D_PERFCNT_L2T_TMUCFG_MISSES,
> +	V3D_PERFCNT_L2T_SLC0_MISSES,
> +	V3D_PERFCNT_L2T_SLC1_MISSES,
> +	V3D_PERFCNT_L2T_SLC2_MISSES,
> +	V3D_PERFCNT_CORE_MEM_WRITES,
> +	V3D_PERFCNT_L2T_MEM_WRITES,
> +	V3D_PERFCNT_PTB_MEM_WRITES,
> +	V3D_PERFCNT_TLB_MEM_WRITES,
> +	V3D_PERFCNT_CORE_MEM_READS,
> +	V3D_PERFCNT_L2T_MEM_READS,
> +	V3D_PERFCNT_PTB_MEM_READS,
> +	V3D_PERFCNT_PSE_MEM_READS,
> +	V3D_PERFCNT_TLB_MEM_READS,
> +	V3D_PERFCNT_GMP_MEM_READS,
> +	V3D_PERFCNT_PTB_W_MEM_WORDS,
> +	V3D_PERFCNT_TLB_W_MEM_WORDS,
> +	V3D_PERFCNT_PSE_R_MEM_WORDS,
> +	V3D_PERFCNT_TLB_R_MEM_WORDS,
> +	V3D_PERFCNT_TMU_MRU_HITS,
> +	V3D_PERFCNT_COMPUTE_ACTIVE,
> +	V3D_PERFCNT_NUM,
> +};
> +
> +#define DRM_V3D_MAX_PERF_COUNTERS                 32
> +
> +struct drm_v3d_perfmon_create {
> +	__u32 id;
> +	__u32 ncounters;
> +	__u8 counters[DRM_V3D_MAX_PERF_COUNTERS];
> +};
> +
> +struct drm_v3d_perfmon_destroy {
> +	__u32 id;
> +};
> +
> +/*
> + * Returns the values of the performance counters tracked by this
> + * perfmon (as an array of ncounters u64 values).
> + *
> + * No implicit synchronization is performed, so the user has to
> + * guarantee that any jobs using this perfmon have already been
> + * completed  (probably by blocking on the seqno returned by the
> + * last exec that used the perfmon).
> + */
> +struct drm_v3d_perfmon_get_values {
> +	__u32 id;
> +	__u32 pad;
> +	__u64 values_ptr;
>  };
>  
>  #if defined(__cplusplus)
> -- 
> 2.25.1
> 


More information about the dri-devel mailing list