[PATCH v2] drm/vc4: Expose performance counters to userspace

Eric Anholt eric at anholt.net
Fri Jan 12 00:35:08 UTC 2018


Boris Brezillon <boris.brezillon at free-electrons.com> writes:

> The V3D engine has various hardware counters which might be interesting
> to userspace performance analysis tools.
>
> Expose new ioctls to create/destroy a performance monitor object and
> query the counter values of this performance monitor.
>
> Note that a performance monitor is given an ID that is only valid on the
> file descriptor it has been allocated from. A performance monitor can be
> attached to a CL submission and the driver will enable HW counters for
> this request and update the performance monitor values at the end of the
> job.
>
> Signed-off-by: Boris Brezillon <boris.brezillon at free-electrons.com>
> ---
> Changes in v2:
> - Get rid of the CL extension stuff
> - Fix isolation of jobs when the perfmons attached to them are different
> - Add more comments in the code
> - Use an SPDX header for vc4_perfmon.c
> - Consider 0 as an invalid perfmonid to be backward compatible with mesa
>   versions that lack perfmon support
> ---
>  drivers/gpu/drm/vc4/Makefile      |   1 +
>  drivers/gpu/drm/vc4/vc4_drv.c     |  26 ++++++
>  drivers/gpu/drm/vc4/vc4_drv.h     |  68 ++++++++++++++
>  drivers/gpu/drm/vc4/vc4_gem.c     |  48 +++++++++-
>  drivers/gpu/drm/vc4/vc4_irq.c     |  40 +++++++-
>  drivers/gpu/drm/vc4/vc4_perfmon.c | 188 ++++++++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/vc4/vc4_regs.h    |  35 +------
>  drivers/gpu/drm/vc4/vc4_v3d.c     |  64 ++++++-------
>  include/uapi/drm/vc4_drm.h        |  67 ++++++++++++++
>  9 files changed, 465 insertions(+), 72 deletions(-)
>  create mode 100644 drivers/gpu/drm/vc4/vc4_perfmon.c
>
> diff --git a/drivers/gpu/drm/vc4/Makefile b/drivers/gpu/drm/vc4/Makefile
> index f5500df51686..4a3a868235f8 100644
> --- a/drivers/gpu/drm/vc4/Makefile
> +++ b/drivers/gpu/drm/vc4/Makefile
> @@ -15,6 +15,7 @@ vc4-y := \
>  	vc4_vec.o \
>  	vc4_hvs.o \
>  	vc4_irq.o \
> +	vc4_perfmon.o \
>  	vc4_plane.o \
>  	vc4_render_cl.o \
>  	vc4_trace_points.o \
> diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c
> index ceb385fd69c5..94b99c90425a 100644
> --- a/drivers/gpu/drm/vc4/vc4_drv.c
> +++ b/drivers/gpu/drm/vc4/vc4_drv.c
> @@ -101,6 +101,7 @@ static int vc4_get_param_ioctl(struct drm_device *dev, void *data,
>  	case DRM_VC4_PARAM_SUPPORTS_THREADED_FS:
>  	case DRM_VC4_PARAM_SUPPORTS_FIXED_RCL_ORDER:
>  	case DRM_VC4_PARAM_SUPPORTS_MADVISE:
> +	case DRM_VC4_PARAM_SUPPORTS_PERFMON:
>  		args->value = true;
>  		break;
>  	default:
> @@ -111,6 +112,26 @@ static int vc4_get_param_ioctl(struct drm_device *dev, void *data,
>  	return 0;
>  }
>  
> +static int vc4_open(struct drm_device *dev, struct drm_file *file)
> +{
> +	struct vc4_file *vc4file;
> +
> +	vc4file = kzalloc(sizeof(*vc4file), GFP_KERNEL);
> +	if (!vc4file)
> +		return -ENOMEM;
> +
> +	vc4_perfmon_open_file(vc4file);
> +	file->driver_priv = vc4file;
> +	return 0;
> +}
> +
> +static void vc4_close(struct drm_device *dev, struct drm_file *file)
> +{
> +	struct vc4_file *vc4file = file->driver_priv;
> +
> +	vc4_perfmon_close_file(vc4file);
> +}
> +
>  static const struct vm_operations_struct vc4_vm_ops = {
>  	.fault = vc4_fault,
>  	.open = drm_gem_vm_open,
> @@ -143,6 +164,9 @@ static const struct drm_ioctl_desc vc4_drm_ioctls[] = {
>  	DRM_IOCTL_DEF_DRV(VC4_GET_TILING, vc4_get_tiling_ioctl, DRM_RENDER_ALLOW),
>  	DRM_IOCTL_DEF_DRV(VC4_LABEL_BO, vc4_label_bo_ioctl, DRM_RENDER_ALLOW),
>  	DRM_IOCTL_DEF_DRV(VC4_GEM_MADVISE, vc4_gem_madvise_ioctl, DRM_RENDER_ALLOW),
> +	DRM_IOCTL_DEF_DRV(VC4_PERFMON_CREATE, vc4_perfmon_create_ioctl, DRM_RENDER_ALLOW),
> +	DRM_IOCTL_DEF_DRV(VC4_PERFMON_DESTROY, vc4_perfmon_destroy_ioctl, DRM_RENDER_ALLOW),
> +	DRM_IOCTL_DEF_DRV(VC4_PERFMON_GET_VALUES, vc4_perfmon_get_values_ioctl, DRM_RENDER_ALLOW),
>  };
>  
>  static struct drm_driver vc4_drm_driver = {
> @@ -153,6 +177,8 @@ static struct drm_driver vc4_drm_driver = {
>  			    DRIVER_RENDER |
>  			    DRIVER_PRIME),
>  	.lastclose = drm_fb_helper_lastclose,
> +	.open = vc4_open,
> +	.postclose = vc4_close,
>  	.irq_handler = vc4_irq,
>  	.irq_preinstall = vc4_irq_preinstall,
>  	.irq_postinstall = vc4_irq_postinstall,
> diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h
> index 3af22936d9b3..fefa1664a9f5 100644
> --- a/drivers/gpu/drm/vc4/vc4_drv.h
> +++ b/drivers/gpu/drm/vc4/vc4_drv.h
> @@ -11,6 +11,8 @@
>  #include <drm/drm_encoder.h>
>  #include <drm/drm_gem_cma_helper.h>
>  
> +#include "uapi/drm/vc4_drm.h"
> +
>  /* Don't forget to update vc4_bo.c: bo_type_names[] when adding to
>   * this.
>   */
> @@ -29,6 +31,36 @@ enum vc4_kernel_bo_type {
>  	VC4_BO_TYPE_COUNT
>  };
>  
> +/* Performance monitor object. The perfmon lifetime is controlled by userspace
> + * using perfmon related ioctls. A perfmon can be attached to a submit_cl
> + * request, and when this is the case, HW perf counters will be activated just
> + * before the submit_cl is submitted to the GPU and disabled when the job is
> + * done. This way, only events related to a specific job will be counted.
> + */
> +struct vc4_perfmon {
> +	/* Tracks the number of users of the perfmon, when this counter reaches
> +	 * zero the perfmon is destroyed.
> +	 */
> +	refcount_t refcnt;
> +
> +	/* Number of counters activated in this perfmon instance
> +	 * (should not exceed DRM_VC4_MAX_PERF_COUNTERS).
> +	 */
> +	u8 ncounters;
> +
> +	/* Events counted by the HW perf counters. */
> +	u8 events[DRM_VC4_MAX_PERF_COUNTERS];
> +
> +	/* Storage for counter values. Counters are incremented by the HW
> +	 * perf counter values every time the perfmon is attached to a GPU job.
> +	 * This way, perfmon users don't have to retrieve the results after
> +	 * each job if they want to track events covering several submissions.
> +	 * Note that counter values can't be reset, but you can fake a reset by
> +	 * destroying the perfmon and creating a new one.
> +	 */
> +	u64 counters[0];
> +};
> +
>  struct vc4_dev {
>  	struct drm_device *dev;
>  
> @@ -121,6 +153,11 @@ struct vc4_dev {
>  	wait_queue_head_t job_wait_queue;
>  	struct work_struct job_done_work;
>  
> +	/* Used to track the active perfmon if any. Access to this field is
> +	 * protected by job_lock.
> +	 */
> +	struct vc4_perfmon *active_perfmon;
> +
>  	/* List of struct vc4_seqno_cb for callbacks to be made from a
>  	 * workqueue when the given seqno is passed.
>  	 */
> @@ -406,6 +443,21 @@ struct vc4_exec_info {
>  	void *uniforms_v;
>  	uint32_t uniforms_p;
>  	uint32_t uniforms_size;
> +
> +	/* Pointer to a performance monitor object if the user requested it,
> +	 * NULL otherwise.
> +	 */
> +	struct vc4_perfmon *perfmon;
> +};
> +
> +/* Per-open file private data. Any driver-specific resource that has to be
> + * released when the DRM file is closed should be placed here.
> + */
> +struct vc4_file {
> +	struct {
> +		struct idr idr;
> +		struct mutex lock;
> +	} perfmon;
>  };
>  
>  static inline struct vc4_exec_info *
> @@ -646,3 +698,19 @@ bool vc4_check_tex_size(struct vc4_exec_info *exec,
>  /* vc4_validate_shader.c */
>  struct vc4_validated_shader_info *
>  vc4_validate_shader(struct drm_gem_cma_object *shader_obj);
> +
> +/* vc4_perfmon.c */
> +void vc4_perfmon_get(struct vc4_perfmon *perfmon);
> +void vc4_perfmon_put(struct vc4_perfmon *perfmon);
> +void vc4_perfmon_start(struct vc4_dev *vc4, struct vc4_perfmon *perfmon);
> +void vc4_perfmon_stop(struct vc4_dev *vc4, struct vc4_perfmon *perfmon,
> +		      bool capture);
> +struct vc4_perfmon *vc4_perfmon_find(struct vc4_file *vc4file, int id);
> +void vc4_perfmon_open_file(struct vc4_file *vc4file);
> +void vc4_perfmon_close_file(struct vc4_file *vc4file);
> +int vc4_perfmon_create_ioctl(struct drm_device *dev, void *data,
> +			     struct drm_file *file_priv);
> +int vc4_perfmon_destroy_ioctl(struct drm_device *dev, void *data,
> +			      struct drm_file *file_priv);
> +int vc4_perfmon_get_values_ioctl(struct drm_device *dev, void *data,
> +				 struct drm_file *file_priv);
> diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c
> index 19ac7fe0e5db..c0589d44e9e1 100644
> --- a/drivers/gpu/drm/vc4/vc4_gem.c
> +++ b/drivers/gpu/drm/vc4/vc4_gem.c
> @@ -454,14 +454,30 @@ vc4_submit_next_bin_job(struct drm_device *dev)
>  
>  	vc4_flush_caches(dev);
>  
> +	/* Only start the perfmon if it was not already started by a previous
> +	 * job.
> +	 */
> +	if (exec->perfmon && vc4->active_perfmon != exec->perfmon)
> +		vc4_perfmon_start(vc4, exec->perfmon);
> +
>  	/* Either put the job in the binner if it uses the binner, or
>  	 * immediately move it to the to-be-rendered queue.
>  	 */
>  	if (exec->ct0ca != exec->ct0ea) {
>  		submit_cl(dev, 0, exec->ct0ca, exec->ct0ea);
>  	} else {
> +		struct vc4_exec_info *next;
> +
>  		vc4_move_job_to_render(dev, exec);
> -		goto again;
> +		next = vc4_first_bin_job(vc4);
> +
> +		/* We can't start the next bin job if the previous job had a
> +		 * different perfmon instance attached to it. The same goes
> +		 * if one of them had a perfmon attached to it and the other
> +		 * one doesn't.
> +		 */
> +		if (next && next->perfmon == exec->perfmon)
> +			goto again;
>  	}
>  }
>  
> @@ -621,6 +637,7 @@ vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec,
>  		 struct ww_acquire_ctx *acquire_ctx)
>  {
>  	struct vc4_dev *vc4 = to_vc4_dev(dev);
> +	struct vc4_exec_info *renderjob;
>  	uint64_t seqno;
>  	unsigned long irqflags;
>  	struct vc4_fence *fence;
> @@ -646,11 +663,14 @@ vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec,
>  
>  	list_add_tail(&exec->head, &vc4->bin_job_list);
>  
> -	/* If no job was executing, kick ours off.  Otherwise, it'll
> -	 * get started when the previous job's flush done interrupt
> -	 * occurs.
> +	/* If no bin job was executing and if the render job (if any) has the
> +	 * same perfmon as our job attached to it (or if both jobs don't have
> +	 * perfmon activated), then kick ours off.  Otherwise, it'll get
> +	 * started when the previous job's flush/render done interrupt occurs.
>  	 */
> -	if (vc4_first_bin_job(vc4) == exec) {
> +	renderjob = vc4_first_render_job(vc4);
> +	if (vc4_first_bin_job(vc4) == exec &&
> +	    (!renderjob || renderjob->perfmon == exec->perfmon)) {
>  		vc4_submit_next_bin_job(dev);
>  		vc4_queue_hangcheck(dev);
>  	}
> @@ -913,6 +933,9 @@ vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
>  	vc4->bin_alloc_used &= ~exec->bin_slots;
>  	spin_unlock_irqrestore(&vc4->job_lock, irqflags);
>  
> +	/* Release the reference we had on the perf monitor. */
> +	vc4_perfmon_put(exec->perfmon);
> +
>  	mutex_lock(&vc4->power_lock);
>  	if (--vc4->power_refcount == 0) {
>  		pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
> @@ -1065,6 +1088,7 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
>  		    struct drm_file *file_priv)
>  {
>  	struct vc4_dev *vc4 = to_vc4_dev(dev);
> +	struct vc4_file *vc4file = file_priv->driver_priv;
>  	struct drm_vc4_submit_cl *args = data;
>  	struct vc4_exec_info *exec;
>  	struct ww_acquire_ctx acquire_ctx;
> @@ -1078,6 +1102,11 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
>  		return -EINVAL;
>  	}
>  
> +	if (args->pad2 != 0) {
> +		DRM_DEBUG("->pad2 must be set to zero\n");
> +		return -EINVAL;
> +	}
> +
>  	exec = kcalloc(1, sizeof(*exec), GFP_KERNEL);
>  	if (!exec) {
>  		DRM_ERROR("malloc failure on exec struct\n");
> @@ -1103,6 +1132,15 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
>  	if (ret)
>  		goto fail;
>  
> +	if (args->perfmonid) {
> +		exec->perfmon = vc4_perfmon_find(vc4file,
> +						 args->perfmonid);
> +		if (!exec->perfmon) {
> +			ret = -ENOENT;
> +			goto fail;
> +		}
> +	}
> +
>  	if (exec->args->bin_cl_size != 0) {
>  		ret = vc4_get_bcl(dev, exec);
>  		if (ret)
> diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c
> index 61b2e5377993..0e0b37635646 100644
> --- a/drivers/gpu/drm/vc4/vc4_irq.c
> +++ b/drivers/gpu/drm/vc4/vc4_irq.c
> @@ -104,13 +104,20 @@ static void
>  vc4_irq_finish_bin_job(struct drm_device *dev)
>  {
>  	struct vc4_dev *vc4 = to_vc4_dev(dev);
> -	struct vc4_exec_info *exec = vc4_first_bin_job(vc4);
> +	struct vc4_exec_info *next, *exec = vc4_first_bin_job(vc4);
>  
>  	if (!exec)
>  		return;
>  
>  	vc4_move_job_to_render(dev, exec);
> -	vc4_submit_next_bin_job(dev);
> +	next = vc4_first_bin_job(vc4);
> +
> +	/* Only submit the next job in the bin list if it matches the perfmon
> +	 * attached to the one that just finished (or if both jobs don't have
> +	 * perfmon attached to them).
> +	 */
> +	if (next && next->perfmon == exec->perfmon)
> +		vc4_submit_next_bin_job(dev);
>  }
>  
>  static void
> @@ -122,6 +129,10 @@ vc4_cancel_bin_job(struct drm_device *dev)
>  	if (!exec)
>  		return;
>  
> +	/* Stop the perfmon so that the next bin job can be started. */
> +	if (exec->perfmon)
> +		vc4_perfmon_stop(vc4, exec->perfmon, false);
> +
>  	list_move_tail(&exec->head, &vc4->bin_job_list);
>  	vc4_submit_next_bin_job(dev);
>  }
> @@ -131,17 +142,40 @@ vc4_irq_finish_render_job(struct drm_device *dev)
>  {
>  	struct vc4_dev *vc4 = to_vc4_dev(dev);
>  	struct vc4_exec_info *exec = vc4_first_render_job(vc4);
> +	struct vc4_exec_info *nextbin, *nextrender;
>  
>  	if (!exec)
>  		return;
>  
>  	vc4->finished_seqno++;
>  	list_move_tail(&exec->head, &vc4->job_done_list);
> +
> +	nextbin = vc4_first_bin_job(vc4);
> +	nextrender = vc4_first_render_job(vc4);
> +
> +	/* Only stop the perfmon if following jobs in the queue don't expect it
> +	 * to be enabled.
> +	 */
> +	if (exec->perfmon && !nextrender &&
> +	    (!nextbin || nextbin->perfmon != exec->perfmon))
> +		vc4_perfmon_stop(vc4, exec->perfmon, true);
> +
> +	/* If there's a render job waiting, start it. If this is not the case
> +	 * we may have to unblock the binner if it's been stalled because of
> +	 * perfmon (this can be checked by comparing the perfmon attached to
> +	 * the finished renderjob to the one attached to the next bin job: if
> +	 * they don't match, this means the binner is stalled and should be
> +	 * restarted).
> +	 */
> +	if (nextrender)
> +		vc4_submit_next_render_job(dev);
> +	else if (nextbin && nextbin->perfmon != exec->perfmon)
> +		vc4_submit_next_bin_job(dev);
> +
>  	if (exec->fence) {
>  		dma_fence_signal_locked(exec->fence);
>  		exec->fence = NULL;
>  	}
> -	vc4_submit_next_render_job(dev);
>  
>  	wake_up_all(&vc4->job_wait_queue);
>  	schedule_work(&vc4->job_done_work);
> diff --git a/drivers/gpu/drm/vc4/vc4_perfmon.c b/drivers/gpu/drm/vc4/vc4_perfmon.c
> new file mode 100644
> index 000000000000..437e7a27f21d
> --- /dev/null
> +++ b/drivers/gpu/drm/vc4/vc4_perfmon.c
> @@ -0,0 +1,188 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2018 Broadcom
> + */
> +
> +/**
> + * DOC: VC4 V3D performance monitor module
> + *
> + * The V3D block provides 16 hardware counters which can count various events.
> + */
> +
> +#include "vc4_drv.h"
> +#include "vc4_regs.h"
> +
> +#define VC4_PERFMONID_MIN	1
> +#define VC4_PERFMONID_MAX	U32_MAX
> +
> +void vc4_perfmon_get(struct vc4_perfmon *perfmon)
> +{
> +	if (perfmon)
> +		refcount_inc(&perfmon->refcnt);
> +}
> +
> +void vc4_perfmon_put(struct vc4_perfmon *perfmon)
> +{
> +	if (perfmon && refcount_dec_and_test(&perfmon->refcnt))
> +		kfree(perfmon);
> +}
> +
> +void vc4_perfmon_start(struct vc4_dev *vc4, struct vc4_perfmon *perfmon)
> +{
> +	unsigned int i;
> +	u32 mask;
> +
> +	if (WARN_ON_ONCE(!perfmon || vc4->active_perfmon))
> +		return;
> +
> +	for (i = 0; i < perfmon->ncounters; i++)
> +		V3D_WRITE(V3D_PCTRS(i), perfmon->events[i]);
> +
> +	mask = GENMASK(perfmon->ncounters - 1, 0);
> +	V3D_WRITE(V3D_PCTRC, mask);
> +	V3D_WRITE(V3D_PCTRE, V3D_PCTRE_EN | mask);
> +	vc4->active_perfmon = perfmon;
> +}
> +
> +void vc4_perfmon_stop(struct vc4_dev *vc4, struct vc4_perfmon *perfmon,
> +		      bool capture)
> +{
> +	unsigned int i;
> +
> +	if (WARN_ON_ONCE(!vc4->active_perfmon ||
> +			 perfmon != vc4->active_perfmon))
> +		return;
> +
> +	if (capture) {
> +		for (i = 0; i < perfmon->ncounters; i++)
> +			perfmon->counters[i] += V3D_READ(V3D_PCTR(i));
> +	}
> +
> +	V3D_WRITE(V3D_PCTRE, 0);
> +	vc4->active_perfmon = NULL;
> +}
> +
> +struct vc4_perfmon *vc4_perfmon_find(struct vc4_file *vc4file, int id)
> +{
> +	struct vc4_perfmon *perfmon;
> +
> +	mutex_lock(&vc4file->perfmon.lock);
> +	perfmon = idr_find(&vc4file->perfmon.idr, id);
> +	vc4_perfmon_get(perfmon);
> +	mutex_unlock(&vc4file->perfmon.lock);
> +
> +	return perfmon;
> +}
> +
> +void vc4_perfmon_open_file(struct vc4_file *vc4file)
> +{
> +	mutex_init(&vc4file->perfmon.lock);
> +	idr_init(&vc4file->perfmon.idr);
> +}
> +
> +static int vc4_perfmon_idr_del(int id, void *elem, void *data)
> +{
> +	struct vc4_perfmon *perfmon = elem;
> +
> +	vc4_perfmon_put(perfmon);
> +
> +	return 0;
> +}
> +
> +void vc4_perfmon_close_file(struct vc4_file *vc4file)
> +{
> +	mutex_lock(&vc4file->perfmon.lock);
> +	idr_for_each(&vc4file->perfmon.idr, vc4_perfmon_idr_del, NULL);
> +	idr_destroy(&vc4file->perfmon.idr);
> +	mutex_unlock(&vc4file->perfmon.lock);
> +}
> +
> +int vc4_perfmon_create_ioctl(struct drm_device *dev, void *data,
> +			     struct drm_file *file_priv)
> +{
> +	struct vc4_file *vc4file = file_priv->driver_priv;
> +	struct drm_vc4_perfmon_create *req = data;
> +	struct vc4_perfmon *perfmon;
> +	unsigned int i;
> +	int ret;
> +
> +	/* Number of monitored counters cannot exceed HW limits. */
> +	if (req->ncounters > DRM_VC4_MAX_PERF_COUNTERS ||
> +	    !req->ncounters)
> +		return -EINVAL;
> +
> +	/* Make sure all events are valid. */
> +	for (i = 0; i < req->ncounters; i++) {
> +		if (req->events[i] >= VC4_PERFCNT_NUM_EVENTS)
> +			return -EINVAL;
> +	}
> +
> +	perfmon = kzalloc(sizeof(*perfmon) + (req->ncounters * sizeof(u64)),
> +			  GFP_KERNEL);
> +	if (!perfmon)
> +		return -ENOMEM;
> +
> +	for (i = 0; i < req->ncounters; i++)
> +		perfmon->events[i] = req->events[i];
> +
> +	perfmon->ncounters = req->ncounters;
> +
> +	refcount_set(&perfmon->refcnt, 1);
> +
> +	mutex_lock(&vc4file->perfmon.lock);
> +	ret = idr_alloc(&vc4file->perfmon.idr, perfmon, VC4_PERFMONID_MIN,
> +			VC4_PERFMONID_MAX, GFP_KERNEL);
> +	mutex_unlock(&vc4file->perfmon.lock);
> +
> +	if (ret < 0) {
> +		kfree(perfmon);
> +		return ret;
> +	}
> +
> +	req->id = ret;
> +	return 0;
> +}
> +
> +int vc4_perfmon_destroy_ioctl(struct drm_device *dev, void *data,
> +			      struct drm_file *file_priv)
> +{
> +	struct vc4_file *vc4file = file_priv->driver_priv;
> +	struct drm_vc4_perfmon_destroy *req = data;
> +	struct vc4_perfmon *perfmon;
> +
> +	mutex_lock(&vc4file->perfmon.lock);
> +	perfmon = idr_remove(&vc4file->perfmon.idr, req->id);
> +	mutex_unlock(&vc4file->perfmon.lock);
> +
> +	if (!perfmon)
> +		return -EINVAL;
> +
> +	vc4_perfmon_put(perfmon);
> +	return 0;
> +}
> +
> +int vc4_perfmon_get_values_ioctl(struct drm_device *dev, void *data,
> +				 struct drm_file *file_priv)
> +{
> +	struct vc4_file *vc4file = file_priv->driver_priv;
> +	struct drm_vc4_perfmon_get_values *req = data;
> +	struct vc4_perfmon *perfmon;
> +	int ret;
> +
> +	mutex_lock(&vc4file->perfmon.lock);
> +	perfmon = idr_find(&vc4file->perfmon.idr, req->id);
> +	vc4_perfmon_get(perfmon);
> +	mutex_unlock(&vc4file->perfmon.lock);
> +
> +	if (!perfmon)
> +		return -EINVAL;
> +
> +	if (copy_to_user(u64_to_user_ptr(req->values_ptr), perfmon->counters,
> +			 perfmon->ncounters * sizeof(u64)))
> +		ret = -EFAULT;
> +	else
> +		ret = 0;
> +
> +	vc4_perfmon_put(perfmon);
> +	return ret;
> +}
> diff --git a/drivers/gpu/drm/vc4/vc4_regs.h b/drivers/gpu/drm/vc4/vc4_regs.h
> index 55677bd50f66..b9749cb24063 100644
> --- a/drivers/gpu/drm/vc4/vc4_regs.h
> +++ b/drivers/gpu/drm/vc4/vc4_regs.h
> @@ -122,38 +122,9 @@
>  #define V3D_VPMBASE  0x00504
>  #define V3D_PCTRC    0x00670
>  #define V3D_PCTRE    0x00674
> -#define V3D_PCTR0    0x00680
> -#define V3D_PCTRS0   0x00684
> -#define V3D_PCTR1    0x00688
> -#define V3D_PCTRS1   0x0068c
> -#define V3D_PCTR2    0x00690
> -#define V3D_PCTRS2   0x00694
> -#define V3D_PCTR3    0x00698
> -#define V3D_PCTRS3   0x0069c
> -#define V3D_PCTR4    0x006a0
> -#define V3D_PCTRS4   0x006a4
> -#define V3D_PCTR5    0x006a8
> -#define V3D_PCTRS5   0x006ac
> -#define V3D_PCTR6    0x006b0
> -#define V3D_PCTRS6   0x006b4
> -#define V3D_PCTR7    0x006b8
> -#define V3D_PCTRS7   0x006bc
> -#define V3D_PCTR8    0x006c0
> -#define V3D_PCTRS8   0x006c4
> -#define V3D_PCTR9    0x006c8
> -#define V3D_PCTRS9   0x006cc
> -#define V3D_PCTR10   0x006d0
> -#define V3D_PCTRS10  0x006d4
> -#define V3D_PCTR11   0x006d8
> -#define V3D_PCTRS11  0x006dc
> -#define V3D_PCTR12   0x006e0
> -#define V3D_PCTRS12  0x006e4
> -#define V3D_PCTR13   0x006e8
> -#define V3D_PCTRS13  0x006ec
> -#define V3D_PCTR14   0x006f0
> -#define V3D_PCTRS14  0x006f4
> -#define V3D_PCTR15   0x006f8
> -#define V3D_PCTRS15  0x006fc
> +# define V3D_PCTRE_EN	BIT(31)
> +#define V3D_PCTR(x)  (0x00680 + ((x) * 8))
> +#define V3D_PCTRS(x) (0x00684 + ((x) * 8))
>  #define V3D_DBGE     0x00f00
>  #define V3D_FDBGO    0x00f04
>  #define V3D_FDBGB    0x00f08
> diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c
> index 622cd43840b8..35c00050d18b 100644
> --- a/drivers/gpu/drm/vc4/vc4_v3d.c
> +++ b/drivers/gpu/drm/vc4/vc4_v3d.c
> @@ -68,38 +68,38 @@ static const struct {
>  	REGDEF(V3D_VPMBASE),
>  	REGDEF(V3D_PCTRC),
>  	REGDEF(V3D_PCTRE),
> -	REGDEF(V3D_PCTR0),
> -	REGDEF(V3D_PCTRS0),
> -	REGDEF(V3D_PCTR1),
> -	REGDEF(V3D_PCTRS1),
> -	REGDEF(V3D_PCTR2),
> -	REGDEF(V3D_PCTRS2),
> -	REGDEF(V3D_PCTR3),
> -	REGDEF(V3D_PCTRS3),
> -	REGDEF(V3D_PCTR4),
> -	REGDEF(V3D_PCTRS4),
> -	REGDEF(V3D_PCTR5),
> -	REGDEF(V3D_PCTRS5),
> -	REGDEF(V3D_PCTR6),
> -	REGDEF(V3D_PCTRS6),
> -	REGDEF(V3D_PCTR7),
> -	REGDEF(V3D_PCTRS7),
> -	REGDEF(V3D_PCTR8),
> -	REGDEF(V3D_PCTRS8),
> -	REGDEF(V3D_PCTR9),
> -	REGDEF(V3D_PCTRS9),
> -	REGDEF(V3D_PCTR10),
> -	REGDEF(V3D_PCTRS10),
> -	REGDEF(V3D_PCTR11),
> -	REGDEF(V3D_PCTRS11),
> -	REGDEF(V3D_PCTR12),
> -	REGDEF(V3D_PCTRS12),
> -	REGDEF(V3D_PCTR13),
> -	REGDEF(V3D_PCTRS13),
> -	REGDEF(V3D_PCTR14),
> -	REGDEF(V3D_PCTRS14),
> -	REGDEF(V3D_PCTR15),
> -	REGDEF(V3D_PCTRS15),
> +	REGDEF(V3D_PCTR(0)),
> +	REGDEF(V3D_PCTRS(0)),
> +	REGDEF(V3D_PCTR(1)),
> +	REGDEF(V3D_PCTRS(1)),
> +	REGDEF(V3D_PCTR(2)),
> +	REGDEF(V3D_PCTRS(2)),
> +	REGDEF(V3D_PCTR(3)),
> +	REGDEF(V3D_PCTRS(3)),
> +	REGDEF(V3D_PCTR(4)),
> +	REGDEF(V3D_PCTRS(4)),
> +	REGDEF(V3D_PCTR(5)),
> +	REGDEF(V3D_PCTRS(5)),
> +	REGDEF(V3D_PCTR(6)),
> +	REGDEF(V3D_PCTRS(6)),
> +	REGDEF(V3D_PCTR(7)),
> +	REGDEF(V3D_PCTRS(7)),
> +	REGDEF(V3D_PCTR(8)),
> +	REGDEF(V3D_PCTRS(8)),
> +	REGDEF(V3D_PCTR(9)),
> +	REGDEF(V3D_PCTRS(9)),
> +	REGDEF(V3D_PCTR(10)),
> +	REGDEF(V3D_PCTRS(10)),
> +	REGDEF(V3D_PCTR(11)),
> +	REGDEF(V3D_PCTRS(11)),
> +	REGDEF(V3D_PCTR(12)),
> +	REGDEF(V3D_PCTRS(12)),
> +	REGDEF(V3D_PCTR(13)),
> +	REGDEF(V3D_PCTRS(13)),
> +	REGDEF(V3D_PCTR(14)),
> +	REGDEF(V3D_PCTRS(14)),
> +	REGDEF(V3D_PCTR(15)),
> +	REGDEF(V3D_PCTRS(15)),
>  	REGDEF(V3D_DBGE),
>  	REGDEF(V3D_FDBGO),
>  	REGDEF(V3D_FDBGB),
> diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h
> index 52263b575bdc..324776c3bbac 100644
> --- a/include/uapi/drm/vc4_drm.h
> +++ b/include/uapi/drm/vc4_drm.h
> @@ -42,6 +42,9 @@ extern "C" {
>  #define DRM_VC4_GET_TILING                        0x09
>  #define DRM_VC4_LABEL_BO                          0x0a
>  #define DRM_VC4_GEM_MADVISE                       0x0b
> +#define DRM_VC4_PERFMON_CREATE                    0x0c
> +#define DRM_VC4_PERFMON_DESTROY                   0x0d
> +#define DRM_VC4_PERFMON_GET_VALUES                0x0e
>  
>  #define DRM_IOCTL_VC4_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl)
>  #define DRM_IOCTL_VC4_WAIT_SEQNO          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno)
> @@ -55,6 +58,9 @@ extern "C" {
>  #define DRM_IOCTL_VC4_GET_TILING          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GET_TILING, struct drm_vc4_get_tiling)
>  #define DRM_IOCTL_VC4_LABEL_BO            DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_LABEL_BO, struct drm_vc4_label_bo)
>  #define DRM_IOCTL_VC4_GEM_MADVISE         DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GEM_MADVISE, struct drm_vc4_gem_madvise)
> +#define DRM_IOCTL_VC4_PERFMON_CREATE      DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_PERFMON_CREATE, struct drm_vc4_perfmon_create)
> +#define DRM_IOCTL_VC4_PERFMON_DESTROY     DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_PERFMON_DESTROY, struct drm_vc4_perfmon_destroy)
> +#define DRM_IOCTL_VC4_PERFMON_GET_VALUES  DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_PERFMON_GET_VALUES, struct drm_vc4_perfmon_get_values)
>  
>  struct drm_vc4_submit_rcl_surface {
>  	__u32 hindex; /* Handle index, or ~0 if not present. */
> @@ -173,6 +179,15 @@ struct drm_vc4_submit_cl {
>  	 * wait ioctl).
>  	 */
>  	__u64 seqno;
> +
> +	/* ID of the perfmon to attach to this job. 0 means no perfmon. */
> +	__u32 perfmonid;
> +
> +	/* Unused field to align this struct on 64 bits. Must be set to 0.
> +	 * If one ever needs to add an u32 field to this struct, this field
> +	 * can be used.
> +	 */
> +	__u32 pad2;
>  };
>  
>  /**
> @@ -308,6 +323,7 @@ struct drm_vc4_get_hang_state {
>  #define DRM_VC4_PARAM_SUPPORTS_THREADED_FS	5
>  #define DRM_VC4_PARAM_SUPPORTS_FIXED_RCL_ORDER	6
>  #define DRM_VC4_PARAM_SUPPORTS_MADVISE		7
> +#define DRM_VC4_PARAM_SUPPORTS_PERFMON		8
>  
>  struct drm_vc4_get_param {
>  	__u32 param;
> @@ -352,6 +368,57 @@ struct drm_vc4_gem_madvise {
>  	__u32 pad;
>  };
>  
> +enum {
> +	VC4_PERFCNT_FEP_VALID_PRIMS_NO_RENDER,
> +	VC4_PERFCNT_FEP_VALID_PRIMS_RENDER,
> +	VC4_PERFCNT_FEP_CLIPPED_QUADS,
> +	VC4_PERFCNT_FEP_VALID_QUADS,
> +	VC4_PERFCNT_TLB_QUADS_NOT_PASSING_STENCIL,
> +	VC4_PERFCNT_TLB_QUADS_NOT_PASSING_Z_AND_STENCIL,
> +	VC4_PERFCNT_TLB_QUADS_PASSING_Z_AND_STENCIL,
> +	VC4_PERFCNT_TLB_QUADS_ZERO_COVERAGE,
> +	VC4_PERFCNT_TLB_QUADS_NON_ZERO_COVERAGE,
> +	VC4_PERFCNT_TLB_QUADS_WRITTEN_TO_COLOR_BUF,
> +	VC4_PERFCNT_PLB_PRIMS_OUTSIDE_VIEWPORT,
> +	VC4_PERFCNT_PLB_PRIMS_NEED_CLIPPING,
> +	VC4_PERFCNT_PSE_PRIMS_REVERSED,
> +	VC4_PERFCNT_QPU_TOTAL_IDLE_CYCLES,
> +	VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_VERTEX_COORD_SHADING,
> +	VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_FRAGMENT_SHADING,
> +	VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_EXEC_VALID_INST,
> +	VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_WAITING_TMUS,
> +	VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_WAITING_SCOREBOARD,
> +	VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_WAITING_VARYINGS,
> +	VC4_PERFCNT_QPU_TOTAL_INST_CACHE_HIT,
> +	VC4_PERFCNT_QPU_TOTAL_INST_CACHE_MISS,
> +	VC4_PERFCNT_QPU_TOTAL_UNIFORM_CACHE_HIT,
> +	VC4_PERFCNT_QPU_TOTAL_UNIFORM_CACHE_MISS,
> +	VC4_PERFCNT_TMU_TOTAL_TEXT_QUADS_PROCESSED,
> +	VC4_PERFCNT_TMU_TOTAL_TEXT_CACHE_MISS,
> +	VC4_PERFCNT_VPM_TOTAL_CLK_CYCLES_VDW_STALLED,
> +	VC4_PERFCNT_VPM_TOTAL_CLK_CYCLES_VCD_STALLED,
> +	VC4_PERFCNT_L2C_TOTAL_L2_CACHE_HIT,
> +	VC4_PERFCNT_L2C_TOTAL_L2_CACHE_MISS,
> +	VC4_PERFCNT_NUM_EVENTS,
> +};
> +
> +#define DRM_VC4_MAX_PERF_COUNTERS	16
> +
> +struct drm_vc4_perfmon_create {
> +	__u32 id;
> +	__u32 ncounters;
> +	__u8 events[DRM_VC4_MAX_PERF_COUNTERS];
> +};
> +
> +struct drm_vc4_perfmon_destroy {
> +	__u32 id;
> +};
> +

Could we add some docs for get_values?  Like:

/*
 * Returns the values of the performance counters tracked by this
 * perfmon (as an array of ncounters u64 values).
 *
 * No implicit synchronization is performed, so the user has to
 * guarantee that any jobs using this perfmon have already been
 * completed (probably by blocking on the seqno returned by the
 * last exec that used the perfmon).
 */

With that,

Reviewed-by: Eric Anholt <eric at anholt.net>

> +struct drm_vc4_perfmon_get_values {
> +	__u32 id;
> +	__u64 values_ptr;
> +};
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 832 bytes
Desc: not available
URL: <https://lists.freedesktop.org/archives/dri-devel/attachments/20180111/b5b7394f/attachment-0001.sig>


More information about the dri-devel mailing list