[Intel-gfx] [PATCH v7 06/11] drm/i915: Enable i915 perf stream for Haswell OA unit

Chris Wilson chris at chris-wilson.co.uk
Tue Oct 25 23:05:44 UTC 2016


On Tue, Oct 25, 2016 at 12:19:29AM +0100, Robert Bragg wrote:
> +static int claim_specific_ctx(struct i915_perf_stream *stream)
> +{
> +	struct drm_i915_private *dev_priv = stream->dev_priv;
> +	struct i915_vma *vma;
> +	int ret;
> +
> +	ret = i915_mutex_lock_interruptible(&dev_priv->drm);

Looking forward to the day these don't need struct_mutex.

> +	if (ret)
> +		return ret;
> +
> +	/* So that we don't have to worry about updating the context ID
> +	 * in OACONTROL on the fly we make sure to pin the context
> +	 * upfront for the lifetime of the stream...
> +	 */
> +	vma = stream->ctx->engine[RCS].state;

There's a caveat here that suggests I had better wrap up this into its
own function. (We need to flush dirty cachelines to memory on first
binding of the context.)

> +	ret = i915_vma_pin(vma, 0, stream->ctx->ggtt_alignment,
> +			   PIN_GLOBAL | PIN_HIGH);
> +	if (ret)
> +		return ret;

Oops — this error path returns with struct_mutex still held (taken by
i915_mutex_lock_interruptible() above); it needs to unlock before
returning, e.g. via a goto-unlock path like the one below.

> +
> +	dev_priv->perf.oa.specific_ctx_id = i915_ggtt_offset(vma);
> +
> +	mutex_unlock(&dev_priv->drm.struct_mutex);
> +
> +	return 0;
> +}


> +static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
> +{
> +	struct drm_i915_gem_object *bo;
> +	struct i915_vma *vma;
> +	int ret;
> +
> +	BUG_ON(dev_priv->perf.oa.oa_buffer.vma);
> +
> +	ret = i915_mutex_lock_interruptible(&dev_priv->drm);
> +	if (ret)
> +		return ret;
> +
> +	BUILD_BUG_ON_NOT_POWER_OF_2(OA_BUFFER_SIZE);
> +	BUILD_BUG_ON(OA_BUFFER_SIZE < SZ_128K || OA_BUFFER_SIZE > SZ_16M);
> +
> +	bo = i915_gem_object_create(&dev_priv->drm, OA_BUFFER_SIZE);
> +	if (IS_ERR(bo)) {
> +		DRM_ERROR("Failed to allocate OA buffer\n");
> +		ret = PTR_ERR(bo);
> +		goto unlock;
> +	}
> +
> +	ret = i915_gem_object_set_cache_level(bo, I915_CACHE_LLC);
> +	if (ret)
> +		goto err_unref;
> +
> +	/* PreHSW required 512K alignment, HSW requires 16M */
> +	vma = i915_gem_object_ggtt_pin(bo, NULL, 0, SZ_16M, PIN_MAPPABLE);

Does this need mappable aperture space for OA? You aren't accessing it
via the aperture, but is the hw limited to it?

> +	if (IS_ERR(vma)) {
> +		ret = PTR_ERR(vma);
> +		goto err_unref;
> +	}
> +	dev_priv->perf.oa.oa_buffer.vma = vma;
> +
> +	dev_priv->perf.oa.oa_buffer.vaddr =
> +		i915_gem_object_pin_map(bo, I915_MAP_WB);
> +	if (IS_ERR(dev_priv->perf.oa.oa_buffer.vaddr)) {
> +		ret = PTR_ERR(dev_priv->perf.oa.oa_buffer.vaddr);
> +		goto err_unpin;
> +	}
> +
> +	dev_priv->perf.oa.ops.init_oa_buffer(dev_priv);
> +
> +	DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p",
> +			 i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma),
> +			 dev_priv->perf.oa.oa_buffer.vaddr);
> +
> +	goto unlock;
> +
> +err_unpin:
> +	__i915_vma_unpin(vma);
> +
> +err_unref:
> +	i915_gem_object_put(bo);
> +
> +	dev_priv->perf.oa.oa_buffer.vaddr = NULL;
> +	dev_priv->perf.oa.oa_buffer.vma = NULL;
> +
> +unlock:
> +	mutex_unlock(&dev_priv->drm.struct_mutex);
> +	return ret;
> +}


> +	if (ret >= 0) {
> +		/* Maybe make ->pollin per-stream state if we support multiple
> +		 * concurrent streams in the future. */
> +		atomic_set(&dev_priv->perf.oa.pollin, false);
> +	}
> +
>  	return ret;
>  }
>  
> -static unsigned int i915_perf_poll_locked(struct i915_perf_stream *stream,
> +static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer)
> +{
> +	struct drm_i915_private *dev_priv =
> +		container_of(hrtimer, typeof(*dev_priv),
> +			     perf.oa.poll_check_timer);
> +
> +	if (!dev_priv->perf.oa.ops.oa_buffer_is_empty(dev_priv)) {
> +		atomic_set(&dev_priv->perf.oa.pollin, true);
> +		wake_up(&dev_priv->perf.oa.poll_wq);
> +	}
> +
> +	hrtimer_forward_now(hrtimer, ns_to_ktime(POLL_PERIOD));
> +
> +	return HRTIMER_RESTART;
> +}
> +
> +static unsigned int i915_perf_poll_locked(struct drm_i915_private *dev_priv,
> +					  struct i915_perf_stream *stream,
>  					  struct file *file,
>  					  poll_table *wait)
>  {
> -	unsigned int streams = 0;
> +	unsigned int events = 0;
>  
>  	stream->ops->poll_wait(stream, file, wait);
>  
> -	if (stream->ops->can_read(stream))
> -		streams |= POLLIN;
> +	/* Note: we don't explicitly check whether there's something to read
> +	 * here since this path may be very hot depending on what else
> +	 * userspace is polling, or on the timeout in use. We rely solely on
> +	 * the hrtimer/oa_poll_check_timer_cb to notify us when there are
> +	 * samples to read.
> +	 */
> +	if (atomic_read(&dev_priv->perf.oa.pollin))
> +		events |= POLLIN;

The atomic_set() and atomic_read() are superfluous, they don't even
impose any memory barriers. The required barrier here is from wake_up().

You can just use a plain dev_priv->perf.oa.pollin = true; or wrap the
accesses in WRITE_ONCE() / READ_ONCE() if you want to clearly show that
it is accessed outside of the lock and that the barriers are imposed
elsewhere.


> @@ -285,18 +1177,18 @@ i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
> +	/* we avoid simply assigning stream->sample_flags = props->sample_flags
> +	 * to have _stream_init check the combination of sample flags more
> +	 * thoroughly, but still this is the expected result at this point.
>  	 */
> -	DRM_ERROR("Unsupported i915 perf stream configuration\n");
> -	ret = -EINVAL;
> -	goto err_alloc;
> +	BUG_ON(stream->sample_flags != props->sample_flags);

	if (WARN_ON(...)) {
		ret = -ENODEV;
		goto err_alloc;
	}

just to avoid checkpatch complaining.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


More information about the Intel-gfx mailing list