[Intel-gfx] [RFC 2/8] drm/i915: Add mechanism for forwarding the timestamp data through perf
Chris Wilson
chris at chris-wilson.co.uk
Wed Jul 15 02:40:55 PDT 2015
On Wed, Jul 15, 2015 at 02:21:40PM +0530, sourab.gupta at intel.com wrote:
> From: Sourab Gupta <sourab.gupta at intel.com>
>
> This patch adds the mechanism for forwarding the timestamp data to
> userspace using the Gen PMU perf event interface.
>
> The timestamps will be captured in a gem buffer object. The metadata
> information (ctx_id right now) pertaining to snapshot is maintained in a
> list, whose each node has offsets into the gem buffer object for each
> snapshot captured.
What is the definition of ctx_id? The only persistent one is the
user_handle which is only valid within the file_priv namespace. There is
no guid for ctx at the moment.
> In order to track whether the gpu has completed processing the node,
> a field pertaining to corresponding gem request is added. The request is
> expected to be referenced whenever the gpu command is submitted.
>
> Each snapshot collected is forwarded as a separate perf sample. The perf
> sample will have raw timestamp data followed by metadata information
> pertaining to that sample.
> While forwarding the samples, we check whether the gem request is completed
> and dereference the corresponding request. The need to dereference the
> request necessitates a worker here, which will be scheduled when the
> hrtimer triggers.
> While flushing the samples, we have to wait for the requests already
> scheduled, before forwarding the samples. This wait is done in a lockless
> fashion.
>
> Signed-off-by: Sourab Gupta <sourab.gupta at intel.com>
> ---
> drivers/gpu/drm/i915/i915_drv.h | 11 ++++
> drivers/gpu/drm/i915/i915_oa_perf.c | 118 +++++++++++++++++++++++++++++++++++-
> include/uapi/drm/i915_drm.h | 10 +++
> 3 files changed, 137 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 7e8e77b..6984150 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1675,6 +1675,13 @@ struct i915_oa_rcs_node {
> u32 tag;
> };
>
> +struct i915_gen_pmu_node {
> + struct list_head head;
> + struct drm_i915_gem_request *req;
> + u32 offset;
> + u32 ctx_id;
> +};
> +
> extern const struct i915_oa_reg i915_oa_3d_mux_config_hsw[];
> extern const int i915_oa_3d_mux_config_hsw_len;
> extern const struct i915_oa_reg i915_oa_3d_b_counter_config_hsw[];
> @@ -1996,7 +2003,11 @@ struct drm_i915_private {
> struct {
> struct drm_i915_gem_object *obj;
> u8 *addr;
> + u32 node_size;
> + u32 node_count;
> } buffer;
> + struct list_head node_list;
> + struct work_struct work_timer;
> } gen_pmu;
>
> void (*insert_profile_cmd[I915_PROFILE_MAX])
> diff --git a/drivers/gpu/drm/i915/i915_oa_perf.c b/drivers/gpu/drm/i915/i915_oa_perf.c
> index ab965b4..350b560 100644
> --- a/drivers/gpu/drm/i915/i915_oa_perf.c
> +++ b/drivers/gpu/drm/i915/i915_oa_perf.c
> @@ -408,11 +408,108 @@ void forward_oa_rcs_snapshots_work(struct work_struct *__work)
> }
> }
>
> +int i915_gen_pmu_wait_gpu(struct drm_i915_private *dev_priv)
Why so many exports from this file? And why are half of them privately
named?
> +{
> + struct i915_gen_pmu_node *last_entry;
> + unsigned long lock_flags;
> + int ret;
> +
> + /*
> + * Wait for the last scheduled request to complete. This would
> + * implicitly wait for the prior submitted requests. The refcount
> + * of the requests is not decremented here.
> + */
> + spin_lock_irqsave(&dev_priv->gen_pmu.lock, lock_flags);
Urm, are we really going to be called in irq context? I really hope you
are not planning on hooking in the irq handlers...
Afaict, you are just using the list from a timer context, so
spin_lock_bh() would be sufficient. Apparently it isn't a timer (thanks
for the misleading name!) so just spin_lock().
> + if (list_empty(&dev_priv->gen_pmu.node_list)) {
> + spin_unlock_irqrestore(&dev_priv->gen_pmu.lock, lock_flags);
> + return 0;
> + }
> + last_entry = list_last_entry(&dev_priv->gen_pmu.node_list,
> + struct i915_gen_pmu_node, head);
> + spin_unlock_irqrestore(&dev_priv->gen_pmu.lock, lock_flags);
You could write this a little neater with just one path through the
critical section.
> + if (last_entry && last_entry->req) {
last_entry cannot be NULL.
When is req NULL? Does the last_entry->req being NULL guarantee that
all previous req are NULL?
> + ret = __i915_wait_request(last_entry->req, atomic_read(
> + &dev_priv->gpu_error.reset_counter),
> + dev_priv->mm.interruptible, NULL, NULL);
Invalid use of dev_priv->mm.interruptible (just pass true).
> + if (ret) {
> + DRM_ERROR("failed to wait\n");
> + return ret;
> + }
> + }
> + return 0;
> +}
> +
> +static void forward_one_gen_pmu_sample(struct drm_i915_private *dev_priv,
> + struct i915_gen_pmu_node *node)
> +{
> + struct perf_sample_data data;
> + struct perf_event *event = dev_priv->gen_pmu.exclusive_event;
> + int ts_size, snapshot_size;
> + u8 *snapshot;
> + struct drm_i915_ts_node_ctx_id *ctx_info;
> + struct perf_raw_record raw;
> +
> + ts_size = sizeof(struct drm_i915_ts_data);
> + snapshot_size = ts_size + sizeof(*ctx_info);
If you kept these as compile time constants it will make the rest a bit
easier to follow.
> + snapshot = dev_priv->gen_pmu.buffer.addr + node->offset;
> +
> + ctx_info = (struct drm_i915_ts_node_ctx_id *)(snapshot + ts_size);
> + ctx_info->ctx_id = node->ctx_id;
> +
> + perf_sample_data_init(&data, 0, event->hw.last_period);
> +
> + /* Note: the combined u32 raw->size member + raw data itself must be 8
> + * byte aligned. (See note in init_gen_pmu_buffer for more details) */
You mean this comment?
/* size has to be aligned to 8 bytes (required by relevant gpu cmds) */
That's not particularly enlightening.
Missing BUILD_BUG tests to assert that your structure sizes are what you
claim they should be. To illustrate this comment, you could do
BUILD_BUG_ON(sizeof(struct drm_i915_ts_data) != 8);
BUILD_BUG_ON(sizeof(struct drm_i915_ts_node_ctx_id) != 8);
BUILD_BUG_ON((snapshot_size + sizeof(raw.size)) % 8 != 0);
Otherwise the comment doesn't really make it clear exactly what has to
be aligned to 8 or *why*.
> + raw.size = snapshot_size + 4;
> + raw.data = snapshot;
> + data.raw = &raw;
> +
> + perf_event_overflow(event, &data, &dev_priv->gen_pmu.dummy_regs);
> +}
> +
> +void forward_gen_pmu_snapshots_work(struct work_struct *__work)
> +{
> + struct drm_i915_private *dev_priv =
> + container_of(__work, typeof(*dev_priv), gen_pmu.work_timer);
> + struct i915_gen_pmu_node *entry, *next;
> + struct drm_i915_gem_request *req;
> + unsigned long lock_flags;
> + int ret;
> +
> + list_for_each_entry_safe
> + (entry, next, &dev_priv->gen_pmu.node_list, head) {
> + req = entry->req;
> + if (req && i915_gem_request_completed(req, true)) {
Negate the test and reduce indentation for ease of reading.
> + forward_one_gen_pmu_sample(dev_priv, entry);
> + ret = i915_mutex_lock_interruptible(dev_priv->dev);
> + if (ret)
> + break;
> + i915_gem_request_assign(&entry->req, NULL);
> + mutex_unlock(&dev_priv->dev->struct_mutex);
This is just i915_gem_request_unreference_unlocked().
> + } else
> + break;
> +
> + /*
> + * Do we instead need to protect whole loop? If so, we would
> + * need to *list_move_tail* to a deferred list, from where
> + * i915 device mutex could be taken to deference the requests,
> + * and free the node.
> + */
> + spin_lock_irqsave(&dev_priv->gen_pmu.lock, lock_flags);
> + list_del(&entry->head);
> + spin_unlock_irqrestore(&dev_priv->gen_pmu.lock, lock_flags);
> + kfree(entry);
> + }
> +}
> +
> static void gen_pmu_flush_snapshots(struct drm_i915_private *dev_priv)
> {
> WARN_ON(!dev_priv->gen_pmu.buffer.addr);
>
> - /* TODO: routine for forwarding snapshots to userspace */
> + schedule_work(&dev_priv->gen_pmu.work_timer);
Why are we scheduling a timer? Might be a bad name for a work item to
infer that it is a timer.
Why is this in a work queue if you already blocked during the flush?
> }
>
> static void
> @@ -761,7 +858,7 @@ static int init_gen_pmu_buffer(struct perf_event *event)
> struct drm_i915_private *dev_priv =
> container_of(event->pmu, typeof(*dev_priv), gen_pmu.pmu);
> struct drm_i915_gem_object *bo;
> - int ret;
> + int ret, node_size;
>
> BUG_ON(dev_priv->gen_pmu.buffer.obj);
>
> @@ -772,6 +869,15 @@ static int init_gen_pmu_buffer(struct perf_event *event)
> dev_priv->gen_pmu.buffer.obj = bo;
>
> dev_priv->gen_pmu.buffer.addr = vmap_oa_buffer(bo);
> + INIT_LIST_HEAD(&dev_priv->gen_pmu.node_list);
> +
> + node_size = sizeof(struct drm_i915_ts_data) +
> + sizeof(struct drm_i915_ts_node_ctx_id);
> +
> + /* size has to be aligned to 8 bytes (required by relevant gpu cmds) */
> + node_size = ALIGN(node_size, 8);
> + dev_priv->gen_pmu.buffer.node_size = node_size;
> + dev_priv->gen_pmu.buffer.node_count = bo->base.size / node_size;
>
> DRM_DEBUG_DRIVER("Gen PMU Buffer initialized, vaddr = %p",
> dev_priv->gen_pmu.buffer.addr);
> @@ -1397,6 +1503,11 @@ static int i915_gen_event_flush(struct perf_event *event)
> {
> struct drm_i915_private *i915 =
> container_of(event->pmu, typeof(*i915), gen_pmu.pmu);
> + int ret;
> +
> + ret = i915_gen_pmu_wait_gpu(i915);
> + if (ret)
> + return ret;
>
> gen_pmu_flush_snapshots(i915);
Wait for idle, then schedule a task???
-Chris
>
--
Chris Wilson, Intel Open Source Technology Centre
More information about the Intel-gfx
mailing list