[Intel-gfx] [PATCH v3] drm/i915: Remove partial attempt to swizzle on pread/pwrite

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Fri Oct 12 12:22:55 UTC 2018


On 12/10/2018 12:56, Chris Wilson wrote:
> Our attempt to account for bit17 swizzling of pread/pwrite onto tiled
> objects was flawed due to the simple fact that we do not always know the
> swizzling for a particular page (the swizzling varies based on the page's
> location in certain unbalanced memory configurations). Furthermore, the
> pread/pwrite paths are themselves inconsistent: in some cases we do not
> have direct CPU access to the backing physical pages and are required to
> use the GTT instead, so some paths try to account for the swizzle while
> others neglect it, and chaos ensues.
> 
> There are no known users who use pread/pwrite into a tiled object
> (you need to manually detile anyway, so why not just use mmap and avoid
> the copy?) and no user bug reports to indicate that it is being used in
> the wild. As no one is hitting the buggy path, we can just remove the
> buggy code.
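
For reference, a minimal userspace sketch of the suggested alternative:
reading a tiled object back through a GTT mmap instead of pread. This is
purely illustrative and not part of the patch; error handling is trimmed,
and 'fd', 'handle' and 'size' are assumed to describe an already open DRM
device and an existing GEM object.

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <drm/i915_drm.h>

/* Read 'len' bytes at 'offset' from a (possibly tiled) GEM object via the
 * GTT aperture; the fence set up on fault detiles in hardware, so no
 * swizzle handling is needed in userspace.
 */
static int read_tiled_object(int fd, uint32_t handle, uint64_t size,
			     void *dst, uint64_t offset, uint64_t len)
{
	struct drm_i915_gem_mmap_gtt mmap_arg = { .handle = handle };
	struct drm_i915_gem_set_domain set_domain = {
		.handle = handle,
		.read_domains = I915_GEM_DOMAIN_GTT,
	};
	void *ptr;

	if (ioctl(fd, DRM_IOCTL_I915_GEM_MMAP_GTT, &mmap_arg))
		return -1;

	ptr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, mmap_arg.offset);
	if (ptr == MAP_FAILED)
		return -1;

	/* Move the object to the GTT read domain for coherent access. */
	if (ioctl(fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &set_domain)) {
		munmap(ptr, size);
		return -1;
	}

	memcpy(dst, (char *)ptr + offset, len);
	munmap(ptr, size);
	return 0;
}
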
> 
> v2: Just use the fault allowing kmap() + normal copy_(to|from)_user
> v3: Avoid int overflow in computing 'length' from 'remain' (Tvrtko)
> 
> References: fe115628d567 ("drm/i915: Implement pwrite without struct-mutex")
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gem.c | 195 ++++----------------------------
>   1 file changed, 25 insertions(+), 170 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 7d45e71100bc..8049458ab6ac 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -859,58 +859,6 @@ flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
>   	obj->write_domain = 0;
>   }
>   
> -static inline int
> -__copy_to_user_swizzled(char __user *cpu_vaddr,
> -			const char *gpu_vaddr, int gpu_offset,
> -			int length)
> -{
> -	int ret, cpu_offset = 0;
> -
> -	while (length > 0) {
> -		int cacheline_end = ALIGN(gpu_offset + 1, 64);
> -		int this_length = min(cacheline_end - gpu_offset, length);
> -		int swizzled_gpu_offset = gpu_offset ^ 64;
> -
> -		ret = __copy_to_user(cpu_vaddr + cpu_offset,
> -				     gpu_vaddr + swizzled_gpu_offset,
> -				     this_length);
> -		if (ret)
> -			return ret + length;
> -
> -		cpu_offset += this_length;
> -		gpu_offset += this_length;
> -		length -= this_length;
> -	}
> -
> -	return 0;
> -}
> -
> -static inline int
> -__copy_from_user_swizzled(char *gpu_vaddr, int gpu_offset,
> -			  const char __user *cpu_vaddr,
> -			  int length)
> -{
> -	int ret, cpu_offset = 0;
> -
> -	while (length > 0) {
> -		int cacheline_end = ALIGN(gpu_offset + 1, 64);
> -		int this_length = min(cacheline_end - gpu_offset, length);
> -		int swizzled_gpu_offset = gpu_offset ^ 64;
> -
> -		ret = __copy_from_user(gpu_vaddr + swizzled_gpu_offset,
> -				       cpu_vaddr + cpu_offset,
> -				       this_length);
> -		if (ret)
> -			return ret + length;
> -
> -		cpu_offset += this_length;
> -		gpu_offset += this_length;
> -		length -= this_length;
> -	}
> -
> -	return 0;
> -}
> -
>   /*
>    * Pins the specified object's pages and synchronizes the object with
>    * GPU accesses. Sets needs_clflush to non-zero if the caller should
> @@ -1030,72 +978,23 @@ int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
>   	return ret;
>   }
>   
> -static void
> -shmem_clflush_swizzled_range(char *addr, unsigned long length,
> -			     bool swizzled)
> -{
> -	if (unlikely(swizzled)) {
> -		unsigned long start = (unsigned long) addr;
> -		unsigned long end = (unsigned long) addr + length;
> -
> -		/* For swizzling simply ensure that we always flush both
> -		 * channels. Lame, but simple and it works. Swizzled
> -		 * pwrite/pread is far from a hotpath - current userspace
> -		 * doesn't use it at all. */
> -		start = round_down(start, 128);
> -		end = round_up(end, 128);
> -
> -		drm_clflush_virt_range((void *)start, end - start);
> -	} else {
> -		drm_clflush_virt_range(addr, length);
> -	}
> -
> -}
> -
> -/* Only difference to the fast-path function is that this can handle bit17
> - * and uses non-atomic copy and kmap functions. */
>   static int
> -shmem_pread_slow(struct page *page, int offset, int length,
> -		 char __user *user_data,
> -		 bool page_do_bit17_swizzling, bool needs_clflush)
> +shmem_pread(struct page *page, int offset, int len, char __user *user_data,
> +	    bool needs_clflush)
>   {
>   	char *vaddr;
>   	int ret;
>   
>   	vaddr = kmap(page);
> -	if (needs_clflush)
> -		shmem_clflush_swizzled_range(vaddr + offset, length,
> -					     page_do_bit17_swizzling);
>   
> -	if (page_do_bit17_swizzling)
> -		ret = __copy_to_user_swizzled(user_data, vaddr, offset, length);
> -	else
> -		ret = __copy_to_user(user_data, vaddr + offset, length);
> -	kunmap(page);
> -
> -	return ret ? - EFAULT : 0;
> -}
> -
> -static int
> -shmem_pread(struct page *page, int offset, int length, char __user *user_data,
> -	    bool page_do_bit17_swizzling, bool needs_clflush)
> -{
> -	int ret;
> +	if (needs_clflush)
> +		drm_clflush_virt_range(vaddr + offset, len);
>   
> -	ret = -ENODEV;
> -	if (!page_do_bit17_swizzling) {
> -		char *vaddr = kmap_atomic(page);
> +	ret = __copy_to_user(user_data, vaddr + offset, len);
>   
> -		if (needs_clflush)
> -			drm_clflush_virt_range(vaddr + offset, length);
> -		ret = __copy_to_user_inatomic(user_data, vaddr + offset, length);
> -		kunmap_atomic(vaddr);
> -	}
> -	if (ret == 0)
> -		return 0;
> +	kunmap(page);
>   
> -	return shmem_pread_slow(page, offset, length, user_data,
> -				page_do_bit17_swizzling, needs_clflush);
> +	return ret ? -EFAULT : 0;
>   }
>   
>   static int
> @@ -1104,15 +1003,10 @@ i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
>   {
>   	char __user *user_data;
>   	u64 remain;
> -	unsigned int obj_do_bit17_swizzling;
>   	unsigned int needs_clflush;
>   	unsigned int idx, offset;
>   	int ret;
>   
> -	obj_do_bit17_swizzling = 0;
> -	if (i915_gem_object_needs_bit17_swizzle(obj))
> -		obj_do_bit17_swizzling = BIT(17);
> -
>   	ret = mutex_lock_interruptible(&obj->base.dev->struct_mutex);
>   	if (ret)
>   		return ret;
> @@ -1127,14 +1021,14 @@ i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
>   	offset = offset_in_page(args->offset);
>   	for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
>   		struct page *page = i915_gem_object_get_page(obj, idx);
> -		int length;
> +		unsigned int length;
>   
> -		length = remain;
> -		if (offset + length > PAGE_SIZE)
> +		if (remain > PAGE_SIZE - offset)
>   			length = PAGE_SIZE - offset;
> +		else
> +			length = remain;
>   
>   		ret = shmem_pread(page, offset, length, user_data,
> -				  page_to_phys(page) & obj_do_bit17_swizzling,
>   				  needs_clflush);
>   		if (ret)
>   			break;
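
A standalone illustration of the v3 note above, i.e. why the length is now
clamped against the page before being narrowed. The values below are made
up and the 4096-byte PAGE_SIZE is only an assumption for the example:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096u	/* assumed page size, for illustration only */

/* Clamp a 64-bit remaining byte count to what is left in the current
 * page, narrowing only after the clamp so the result always fits in an
 * unsigned int.  Assigning the u64 'remain' straight to an int, as the
 * pre-v3 code did, could truncate for requests larger than INT_MAX.
 */
static unsigned int clamp_length(uint64_t remain, unsigned int offset)
{
	if (remain > PAGE_SIZE - offset)
		return PAGE_SIZE - offset;
	return remain;
}

int main(void)
{
	uint64_t remain = (1ull << 32) + 1;	/* larger than any 32-bit int */

	printf("%u\n", clamp_length(remain, 128));	/* prints 3968 */
	return 0;
}
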
> @@ -1475,33 +1369,6 @@ i915_gem_gtt_pwrite_fast(struct drm_i915_gem_object *obj,
>   	return ret;
>   }
>   
> -static int
> -shmem_pwrite_slow(struct page *page, int offset, int length,
> -		  char __user *user_data,
> -		  bool page_do_bit17_swizzling,
> -		  bool needs_clflush_before,
> -		  bool needs_clflush_after)
> -{
> -	char *vaddr;
> -	int ret;
> -
> -	vaddr = kmap(page);
> -	if (unlikely(needs_clflush_before || page_do_bit17_swizzling))
> -		shmem_clflush_swizzled_range(vaddr + offset, length,
> -					     page_do_bit17_swizzling);
> -	if (page_do_bit17_swizzling)
> -		ret = __copy_from_user_swizzled(vaddr, offset, user_data,
> -						length);
> -	else
> -		ret = __copy_from_user(vaddr + offset, user_data, length);
> -	if (needs_clflush_after)
> -		shmem_clflush_swizzled_range(vaddr + offset, length,
> -					     page_do_bit17_swizzling);
> -	kunmap(page);
> -
> -	return ret ? -EFAULT : 0;
> -}
> -
>   /* Per-page copy function for the shmem pwrite fastpath.
>    * Flushes invalid cachelines before writing to the target if
>    * needs_clflush_before is set and flushes out any written cachelines after
> @@ -1509,31 +1376,24 @@ shmem_pwrite_slow(struct page *page, int offset, int length,
>    */
>   static int
>   shmem_pwrite(struct page *page, int offset, int len, char __user *user_data,
> -	     bool page_do_bit17_swizzling,
>   	     bool needs_clflush_before,
>   	     bool needs_clflush_after)
>   {
> +	char *vaddr;
>   	int ret;
>   
> -	ret = -ENODEV;
> -	if (!page_do_bit17_swizzling) {
> -		char *vaddr = kmap_atomic(page);
> +	vaddr = kmap(page);
>   
> -		if (needs_clflush_before)
> -			drm_clflush_virt_range(vaddr + offset, len);
> -		ret = __copy_from_user_inatomic(vaddr + offset, user_data, len);
> -		if (needs_clflush_after)
> -			drm_clflush_virt_range(vaddr + offset, len);
> +	if (needs_clflush_before)
> +		drm_clflush_virt_range(vaddr + offset, len);
>   
> -		kunmap_atomic(vaddr);
> -	}
> -	if (ret == 0)
> -		return ret;
> +	ret = __copy_from_user(vaddr + offset, user_data, len);
> +	if (!ret && needs_clflush_after)
> +		drm_clflush_virt_range(vaddr + offset, len);
> +
> +	kunmap(page);
>   
> -	return shmem_pwrite_slow(page, offset, len, user_data,
> -				 page_do_bit17_swizzling,
> -				 needs_clflush_before,
> -				 needs_clflush_after);
> +	return ret ? -EFAULT : 0;
>   }
>   
>   static int
> @@ -1543,7 +1403,6 @@ i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
>   	struct drm_i915_private *i915 = to_i915(obj->base.dev);
>   	void __user *user_data;
>   	u64 remain;
> -	unsigned int obj_do_bit17_swizzling;
>   	unsigned int partial_cacheline_write;
>   	unsigned int needs_clflush;
>   	unsigned int offset, idx;
> @@ -1558,10 +1417,6 @@ i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
>   	if (ret)
>   		return ret;
>   
> -	obj_do_bit17_swizzling = 0;
> -	if (i915_gem_object_needs_bit17_swizzle(obj))
> -		obj_do_bit17_swizzling = BIT(17);
> -
>   	/* If we don't overwrite a cacheline completely we need to be
>   	 * careful to have up-to-date data by first clflushing. Don't
>   	 * overcomplicate things and flush the entire patch.
> @@ -1575,14 +1430,14 @@ i915_gem_shmem_pwrite(struct drm_i915_gem_object *obj,
>   	offset = offset_in_page(args->offset);
>   	for (idx = args->offset >> PAGE_SHIFT; remain; idx++) {
>   		struct page *page = i915_gem_object_get_page(obj, idx);
> -		int length;
> +		unsigned int length;
>   
> -		length = remain;
> -		if (offset + length > PAGE_SIZE)
> +		if (remain > PAGE_SIZE - offset)
>   			length = PAGE_SIZE - offset;
> +		else
> +			length = remain;
>   
>   		ret = shmem_pwrite(page, offset, length, user_data,
> -				   page_to_phys(page) & obj_do_bit17_swizzling,
>   				   (offset | length) & partial_cacheline_write,
>   				   needs_clflush & CLFLUSH_AFTER);
>   		if (ret)
> 
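
And, for completeness, a small sketch of the partial-cacheline test that
feeds shmem_pwrite() above. The 64-byte cacheline is only assumed here;
the driver derives the real mask from the CPU's reported clflush size and
uses a zero mask when no flush-before is required:

#include <stdbool.h>

/* Non-zero whenever the write does not both start and end on a cacheline
 * boundary, i.e. whenever it could leave a partially written cacheline
 * that needs flushing before the copy.
 */
static bool is_partial_cacheline_write(unsigned int offset, unsigned int length)
{
	const unsigned int partial_cacheline_write = 64 - 1;	/* assumed */

	return (offset | length) & partial_cacheline_write;
}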

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>

But as mentioned before, please gather some more acks for this change.
Strictly speaking it is not a uAPI removal, but it is a
behaviour/functionality change.

Regards,

Tvrtko

