[Intel-gfx] [PATCH v2 2/2] drm/i915: Perform object clflushing asynchronously

Tue Feb 21 14:49:56 UTC 2017

On ma, 2017-02-20 at 20:59 +0000, Chris Wilson wrote:
> Flushing the cachelines for an object is slow, can be as much as 100ms
> for a large framebuffer. We currently do this under the struct_mutex BKL
> on execution or on pageflip. But now with the ability to add fences to
> obj->resv for both flips and execbuf (and we naturally wait on the fence
> before CPU access), we can move the clflush operation to a workqueue and
> signal a fence for completion, thereby doing the work asynchronously and
> not blocking the driver or its clients.
> 
> Suggested-by: Akash Goel <akash.goel at intel.com>
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
> Cc: Matthew Auld <matthew.auld at intel.com>

<SNIP>

> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3389,7 +3389,13 @@ int i915_gem_reset_prepare(struct drm_i915_private *dev_priv);
>  void i915_gem_reset(struct drm_i915_private *dev_priv);
>  void i915_gem_reset_finish(struct drm_i915_private *dev_priv);
>  void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
> -void i915_gem_clflush_object(struct drm_i915_gem_object *obj, bool force);

i915_gem_clflush.h

> +
> +void i915_gem_clflush_init(struct drm_i915_private *i915);
> +void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
> > +			     unsigned int flags);
> +#define I915_CLFLUSH_FORCE BIT(0)
> +#define I915_CLFLUSH_SYNC BIT(1)
> +
>  void i915_gem_init_mmio(struct drm_i915_private *i915);
>  int __must_check i915_gem_init(struct drm_i915_private *dev_priv);
>  int __must_check i915_gem_init_hw(struct drm_i915_private *dev_priv);

<SNIP>

> +static const char *i915_clflush_get_driver_name(struct dma_fence *fence)
> +{
> +	return "i915";

Why not DRIVER_NAME?

> +static void i915_clflush_release(struct dma_fence *fence)
> +{
> +	struct clflushf *f = container_of(fence, typeof(*f), dma);

Better variable and struct name needed.

> +
> +	i915_sw_fence_fini(&f->wait);
> +
> +	BUILD_BUG_ON(offsetof(typeof(*f), dma));

Maybe make a comment at the struct definition, too?

> +static bool cpu_cache_is_coherent(struct drm_device *dev,
> +				  enum i915_cache_level level)
> +{
> +	return HAS_LLC(to_i915(dev)) || level != I915_CACHE_NONE;
> +}

We're not using this elsewhere?

> +
> +static void __i915_do_clflush(struct drm_i915_gem_object *obj)
> +{
> +	drm_clflush_sg(obj->mm.pages);
> +	obj->cache_dirty = false;
> +
> +	intel_fb_obj_flush(obj, false, ORIGIN_CPU);

This being hardcoded, use of ORIGIN_DIRTYFB disappears. Nobody is going
to miss it?

> +}
> +
> +static void i915_clflush_work(struct work_struct *work)
> +{
> +	struct clflushf *f = container_of(work, typeof(*f), work);
> +	struct drm_i915_gem_object *obj = f->obj;
> +
> +	if (i915_gem_object_pin_pages(obj)) {
> +		obj->cache_dirty = true;
> +		goto out;
> +	}

For the time being, I'd just WARN here.

> +
> +	__i915_do_clflush(obj);
> +
> +	i915_gem_object_unpin_pages(obj);
> +
> +out:
> +	i915_gem_object_put(obj);
> +
> +	dma_fence_signal(&f->dma);
> +	dma_fence_put(&f->dma);
> +}
> +

<SNIP>

> @@ -14279,15 +14282,11 @@ static int intel_user_framebuffer_dirty(struct drm_framebuffer *fb,
>  					struct drm_clip_rect *clips,
>  					unsigned num_clips)
>  {
> -	struct drm_device *dev = fb->dev;
>  	struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb);
>  	struct drm_i915_gem_object *obj = intel_fb->obj;
>  
> -	mutex_lock(&dev->struct_mutex);
>  	if (obj->pin_display && obj->cache_dirty)
> -		i915_gem_clflush_object(obj, true);
> -	intel_fb_obj_flush(obj, false, ORIGIN_DIRTYFB);
> -	mutex_unlock(&dev->struct_mutex);
> +		i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);

You removed mutex_lock, add READ_ONCE for code coherency.

With above addressed;

Reviewed-by: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>

Regards, Joonas

>  
>  	return 0;
>  }
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation