[Intel-gfx] [PATCH 3/8] drm/i915: Extract i915_gem_obj_prepare_shmem_write()

Thu Jun 9 12:47:55 UTC 2016

On to, 2016-06-09 at 12:29 +0100, Chris Wilson wrote:
> This is a companion to i915_gem_obj_prepare_shmem_read() that prepares
> the backing storage for direct writes. It first serialises with the GPU,
> pins the backing storage and then indicates what clfushes are required in
> order for the writes to be coherent.
> 
> Whilst here, fix support for ancient CPUs without clflush for which we
> cannot do the GTT+clflush tricks.
> 

Could add here that WARN is kicked

> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_cmd_parser.c |   2 +-
>  drivers/gpu/drm/i915/i915_drv.h        |   6 +-
>  drivers/gpu/drm/i915/i915_gem.c        | 119 +++++++++++++++++++++------------
>  3 files changed, 83 insertions(+), 44 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_cmd_parser.c b/drivers/gpu/drm/i915/i915_cmd_parser.c
> index b0fd6a7b0603..de038da6c01b 100644
> --- a/drivers/gpu/drm/i915/i915_cmd_parser.c
> +++ b/drivers/gpu/drm/i915/i915_cmd_parser.c
> @@ -970,7 +970,7 @@ static u32 *copy_batch(struct drm_i915_gem_object *dest_obj,
>  		       u32 batch_start_offset,
>  		       u32 batch_len)
>  {
> -	int needs_clflush = 0;
> +	unsigned needs_clflush;

Mildly irritated by innuendo to boolean. Not sure why not just "flags".

>  	void *src_base, *src;
>  	void *dst = NULL;
>  	int ret;
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index b333cf4923bc..ef321f84e5fc 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3026,7 +3026,11 @@ void i915_gem_release_all_mmaps(struct drm_i915_private *dev_priv);
>  void i915_gem_release_mmap(struct drm_i915_gem_object *obj);
>  
>  int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
> -				    int *needs_clflush);
> +				    unsigned *needs_clflush);
> +int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
> +				     unsigned *needs_clflush);
> +#define CLFLUSH_BEFORE 0x1
> +#define CLFLUSH_AFTER 0x2
>  
>  int __must_check i915_gem_object_get_pages(struct drm_i915_gem_object *obj);
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index a41fa01eb024..c6d06cb21191 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -563,34 +563,95 @@ __copy_from_user_swizzled(char *gpu_vaddr, int gpu_offset,
>   * flush the object from the CPU cache.
>   */
>  int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
> -				    int *needs_clflush)
> +				    unsigned *needs_clflush)
>  {
>  	int ret;
>  
>  	*needs_clflush = 0;
> -
> -	if (WARN_ON((obj->ops->flags & I915_GEM_OBJECT_HAS_STRUCT_PAGE) == 0))
> +	if ((obj->ops->flags & I915_GEM_OBJECT_HAS_STRUCT_PAGE) == 0)

Why no more WARN?

>  		return -EINVAL;
>  
>  	if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU)) {
> +		ret = i915_gem_object_wait_rendering(obj, true);
> +		if (ret)
> +			return ret;
> +
>  		/* If we're not in the cpu read domain, set ourself into the gtt
>  		 * read domain and manually flush cachelines (if required). This
>  		 * optimizes for the case when the gpu will dirty the data
>  		 * anyway again before the next pread happens. */
>  		*needs_clflush = !cpu_cache_is_coherent(obj->base.dev,
>  							obj->cache_level);

Maybe add CLFLUSH_NONE and use ?:, this is deceiving now that it's not
boolean, and that's why the name should be changed too.

> -		ret = i915_gem_object_wait_rendering(obj, true);
> +	}
> +
> +	ret = i915_gem_object_get_pages(obj);
> +	if (ret)
> +		return ret;
> +
> +	i915_gem_object_pin_pages(obj);
> +
> +	if (*needs_clflush && !boot_cpu_has(X86_FEATURE_CLFLUSH)) {

Your own comment applies here.

> +		ret = i915_gem_object_set_to_cpu_domain(obj, false);
> +		if (ret) {
> +			i915_gem_object_unpin_pages(obj);
> +			return ret;
> +		}
> +		*needs_clflush = 0;
> +	}
> +
> +	return 0;
> +}
> +
> +int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
> +				    unsigned *needs_clflush)
> +{
> +	int ret;
> +
> +	*needs_clflush = 0;
> +	if ((obj->ops->flags & I915_GEM_OBJECT_HAS_STRUCT_PAGE) == 0)
> +		return -EINVAL;
> +
> +	if (obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
> +		ret = i915_gem_object_wait_rendering(obj, false);
>  		if (ret)
>  			return ret;
> +
> +		/* If we're not in the cpu write domain, set ourself into the
> +		 * gtt write domain and manually flush cachelines (as required).
> +		 * This optimizes for the case when the gpu will use the data
> +		 * right away and we therefore have to clflush anyway.
> +		 */
> +		*needs_clflush |= cpu_write_needs_clflush(obj) << 1;
>  	}
>  
> +	/* Same trick applies to invalidate partially written cachelines read
> +	 * before writing.
> +	 */
> +	if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0)
> +		*needs_clflush |= !cpu_cache_is_coherent(obj->base.dev,
> +							 obj->cache_level);
> +
>  	ret = i915_gem_object_get_pages(obj);
>  	if (ret)
>  		return ret;
>  
>  	i915_gem_object_pin_pages(obj);
>  
> -	return ret;
> +	if (*needs_clflush && !boot_cpu_has(X86_FEATURE_CLFLUSH)) {
> +		ret = i915_gem_object_set_to_cpu_domain(obj, true);
> +		if (ret) {
> +			i915_gem_object_unpin_pages(obj);
> +			return ret;
> +		}
> +		*needs_clflush = 0;
> +	}

Relatively large common mid-section in these two, static func?

Otherwise looks fine to me, those explained or fixed;

Reviewed-by: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>

Regards, Joonas

> +
> +	if ((*needs_clflush & CLFLUSH_AFTER) == 0)
> +		obj->cache_dirty = true;
> +
> +	intel_fb_obj_invalidate(obj, ORIGIN_CPU);
> +	obj->dirty = 1;
> +	return 0;
> + }
>  
>  /* Per-page copy function for the shmem pread fastpath.
> @@ -999,41 +1060,17 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
>  	int shmem_page_offset, page_length, ret = 0;
>  	int obj_do_bit17_swizzling, page_do_bit17_swizzling;
>  	int hit_slowpath = 0;
> -	int needs_clflush_after = 0;
> -	int needs_clflush_before = 0;
> +	unsigned needs_clflush;
>  	struct sg_page_iter sg_iter;
>  
> -	user_data = u64_to_user_ptr(args->data_ptr);
> -	remain = args->size;
> -
> -	obj_do_bit17_swizzling = i915_gem_object_needs_bit17_swizzle(obj);
> -
> -	if (obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
> -		/* If we're not in the cpu write domain, set ourself into the gtt
> -		 * write domain and manually flush cachelines (if required). This
> -		 * optimizes for the case when the gpu will use the data
> -		 * right away and we therefore have to clflush anyway. */
> -		needs_clflush_after = cpu_write_needs_clflush(obj);
> -		ret = i915_gem_object_wait_rendering(obj, false);
> -		if (ret)
> -			return ret;
> -	}
> -	/* Same trick applies to invalidate partially written cachelines read
> -	 * before writing. */
> -	if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0)
> -		needs_clflush_before =
> -			!cpu_cache_is_coherent(dev, obj->cache_level);
> -
> -	ret = i915_gem_object_get_pages(obj);
> +	ret = i915_gem_obj_prepare_shmem_write(obj, &needs_clflush);
>  	if (ret)
>  		return ret;
>  
> -	intel_fb_obj_invalidate(obj, ORIGIN_CPU);
> -
> -	i915_gem_object_pin_pages(obj);
> -
> +	obj_do_bit17_swizzling = i915_gem_object_needs_bit17_swizzle(obj);
> +	user_data = u64_to_user_ptr(args->data_ptr);
>  	offset = args->offset;
> -	obj->dirty = 1;
> +	remain = args->size;
>  
>  	for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents,
>  			 offset >> PAGE_SHIFT) {
> @@ -1057,7 +1094,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
>  		/* If we don't overwrite a cacheline completely we need to be
>  		 * careful to have up-to-date data by first clflushing. Don't
>  		 * overcomplicate things and flush the entire patch. */
> -		partial_cacheline_write = needs_clflush_before &&
> +		partial_cacheline_write = needs_clflush & CLFLUSH_BEFORE &&
>  			((shmem_page_offset | page_length)
>  				& (boot_cpu_data.x86_clflush_size - 1));
>  
> @@ -1067,7 +1104,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
>  		ret = shmem_pwrite_fast(page, shmem_page_offset, page_length,
>  					user_data, page_do_bit17_swizzling,
>  					partial_cacheline_write,
> -					needs_clflush_after);
> +					needs_clflush & CLFLUSH_AFTER);
>  		if (ret == 0)
>  			goto next_page;
>  
> @@ -1076,7 +1113,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
>  		ret = shmem_pwrite_slow(page, shmem_page_offset, page_length,
>  					user_data, page_do_bit17_swizzling,
>  					partial_cacheline_write,
> -					needs_clflush_after);
> +					needs_clflush & CLFLUSH_AFTER);
>  
>  		mutex_lock(&dev->struct_mutex);
>  
> @@ -1098,17 +1135,15 @@ out:
>  		 * cachelines in-line while writing and the object moved
>  		 * out of the cpu write domain while we've dropped the lock.
>  		 */
> -		if (!needs_clflush_after &&
> +		if (!(needs_clflush & CLFLUSH_AFTER) &&
>  		    obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
>  			if (i915_gem_clflush_object(obj, obj->pin_display))
> -				needs_clflush_after = true;
> +				needs_clflush |= CLFLUSH_AFTER;
>  		}
>  	}
>  
> -	if (needs_clflush_after)
> +	if (needs_clflush & CLFLUSH_AFTER)
>  		i915_gem_chipset_flush(to_i915(dev));
> -	else
> -		obj->cache_dirty = true;
>  
>  	intel_fb_obj_flush(obj, false, ORIGIN_CPU);
>  	return ret;
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation