[Intel-gfx] [PATCH] drm/i915: Remove partial attempt to swizzle on pread/pwrite

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Thu Oct 11 14:44:19 UTC 2018


On 11/10/2018 15:30, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-10-11 14:38:41)
>>
>> On 23/07/2018 10:02, Chris Wilson wrote:
>>> Our attempt to account for bit17 swizzling of pread/pwrite onto tiled
>>> objects was flawed due to the simple fact that we do not always know the
>>> swizzling for a particular page (due to the swizzling varying based on
>>> location in certain unbalanced configurations). Furthermore, the
>>> pread/pwrite paths are now unbalanced in that we are required to use the
>>> GTT as in some cases we do not have direct CPU access to the backing
>>> physical pages (thus some paths trying to account for the swizzle, but
>>> others neglecting, chaos ensues).
>>>
>>> There are no known users who do use pread/pwrite into a tiled object
>>> (you need to manually detile anyway, so why not just use mmap and avoid
>>> the copy?) and no user bug reports to indicate that it is being used in
>>> the wild. As no one is hitting the buggy path, we can just remove the
>>> buggy code.
>>>
>>> References: fe115628d567 ("drm/i915: Implement pwrite without struct-mutex")
>>> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
>>> Cc: Joonas Lahtinen <joonas.lahtinen at linux.intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/i915_gem.c | 191 +++++---------------------------
>>>    1 file changed, 30 insertions(+), 161 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>>> index 8b52cb768a67..73a69352e0d4 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>>> @@ -859,58 +859,6 @@ flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
>>>        obj->write_domain = 0;
>>>    }
>>>    
>>> -static inline int
>>> -__copy_to_user_swizzled(char __user *cpu_vaddr,
>>> -                     const char *gpu_vaddr, int gpu_offset,
>>> -                     int length)
>>> -{
>>> -     int ret, cpu_offset = 0;
>>> -
>>> -     while (length > 0) {
>>> -             int cacheline_end = ALIGN(gpu_offset + 1, 64);
>>> -             int this_length = min(cacheline_end - gpu_offset, length);
>>> -             int swizzled_gpu_offset = gpu_offset ^ 64;
>>> -
>>> -             ret = __copy_to_user(cpu_vaddr + cpu_offset,
>>> -                                  gpu_vaddr + swizzled_gpu_offset,
>>> -                                  this_length);
>>> -             if (ret)
>>> -                     return ret + length;
>>> -
>>> -             cpu_offset += this_length;
>>> -             gpu_offset += this_length;
>>> -             length -= this_length;
>>> -     }
>>> -
>>> -     return 0;
>>> -}
>>> -
>>> -static inline int
>>> -__copy_from_user_swizzled(char *gpu_vaddr, int gpu_offset,
>>> -                       const char __user *cpu_vaddr,
>>> -                       int length)
>>> -{
>>> -     int ret, cpu_offset = 0;
>>> -
>>> -     while (length > 0) {
>>> -             int cacheline_end = ALIGN(gpu_offset + 1, 64);
>>> -             int this_length = min(cacheline_end - gpu_offset, length);
>>> -             int swizzled_gpu_offset = gpu_offset ^ 64;
>>> -
>>> -             ret = __copy_from_user(gpu_vaddr + swizzled_gpu_offset,
>>> -                                    cpu_vaddr + cpu_offset,
>>> -                                    this_length);
>>> -             if (ret)
>>> -                     return ret + length;
>>> -
>>> -             cpu_offset += this_length;
>>> -             gpu_offset += this_length;
>>> -             length -= this_length;
>>> -     }
>>> -
>>> -     return 0;
>>> -}
>>> -
>>>    /*
>>>     * Pins the specified object's pages and synchronizes the object with
>>>     * GPU accesses. Sets needs_clflush to non-zero if the caller should
>>> @@ -1030,72 +978,29 @@ int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
>>>        return ret;
>>>    }
>>>    
>>> -static void
>>> -shmem_clflush_swizzled_range(char *addr, unsigned long length,
>>> -                          bool swizzled)
>>> -{
>>> -     if (unlikely(swizzled)) {
>>> -             unsigned long start = (unsigned long) addr;
>>> -             unsigned long end = (unsigned long) addr + length;
>>> -
>>> -             /* For swizzling simply ensure that we always flush both
>>> -              * channels. Lame, but simple and it works. Swizzled
>>> -              * pwrite/pread is far from a hotpath - current userspace
>>> -              * doesn't use it at all. */
>>> -             start = round_down(start, 128);
>>> -             end = round_up(end, 128);
>>> -
>>> -             drm_clflush_virt_range((void *)start, end - start);
>>> -     } else {
>>> -             drm_clflush_virt_range(addr, length);
>>> -     }
>>> -
>>> -}
>>> -
>>> -/* Only difference to the fast-path function is that this can handle bit17
>>> - * and uses non-atomic copy and kmap functions. */
>>>    static int
>>> -shmem_pread_slow(struct page *page, int offset, int length,
>>> -              char __user *user_data,
>>> -              bool page_do_bit17_swizzling, bool needs_clflush)
>>> +shmem_pread(struct page *page, int offset, int len, char __user *user_data,
>>> +         bool needs_clflush)
>>>    {
>>>        char *vaddr;
>>>        int ret;
>>>    
>>> -     vaddr = kmap(page);
>>> -     if (needs_clflush)
>>> -             shmem_clflush_swizzled_range(vaddr + offset, length,
>>> -                                          page_do_bit17_swizzling);
>>> -
>>> -     if (page_do_bit17_swizzling)
>>> -             ret = __copy_to_user_swizzled(user_data, vaddr, offset, length);
>>> -     else
>>> -             ret = __copy_to_user(user_data, vaddr + offset, length);
>>> -     kunmap(page);
>>> +     vaddr = kmap_atomic(page);
>>>    
>>> -     return ret ? - EFAULT : 0;
>>> -}
>>> +     if (needs_clflush)
>>> +             drm_clflush_virt_range(vaddr + offset, len);
>>>    
>>> -static int
>>> -shmem_pread(struct page *page, int offset, int length, char __user *user_data,
>>> -         bool page_do_bit17_swizzling, bool needs_clflush)
>>> -{
>>> -     int ret;
>>> +     ret = __copy_to_user_inatomic(user_data, vaddr + offset, len);
>>
>> Kerneldoc says userspace memory must be pinned so potential page faults
>> are avoided. I don't see that our code does that.
> 
> kmap_atomic once upon a time gave assurances that _inatomic was safe.
> It's not important, kmap() is good enough (once upon a time there was a
> major difference for kmap_atomic as you had to use preallocated slots,
> the good old days ;)

kmap_atomic is just the other end of the copy though; user_data is the 
side which also mustn't fault. But never mind, the kmap-only fallback 
path indeed sounds fine.
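
For reference, and just to make sure I read it right, the pattern the 
new shmem_pread() boils down to is roughly the below (illustrative 
sketch only, hypothetical helper name, clflush handling omitted):

	static int copy_page_to_user(struct page *page, int offset, int len,
				     char __user *user_data)
	{
		char *vaddr;
		unsigned long unwritten;

		/* Fast path: kmap_atomic() disallows sleeping, so a fault on a
		 * non-resident userspace page cannot be serviced; the _inatomic
		 * copy then returns non-zero instead of faulting the page in. */
		vaddr = kmap_atomic(page);
		unwritten = __copy_to_user_inatomic(user_data, vaddr + offset, len);
		kunmap_atomic(vaddr);

		if (unwritten) {
			/* Slow path: kmap() may sleep, so the fault handler can
			 * bring the userspace page in and the copy can complete. */
			vaddr = kmap(page);
			unwritten = __copy_to_user(user_data, vaddr + offset, len);
			kunmap(page);
		}

		return unwritten ? -EFAULT : 0;
	}

i.e. the atomic attempt is only an optimisation and the kmap() fallback 
is what provides correctness.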

>   
>>> -     ret = -ENODEV;
>>> -     if (!page_do_bit17_swizzling) {
>>> -             char *vaddr = kmap_atomic(page);
>>> +     kunmap_atomic(vaddr);
>>>    
>>> -             if (needs_clflush)
>>> -                     drm_clflush_virt_range(vaddr + offset, length);
>>> -             ret = __copy_to_user_inatomic(user_data, vaddr + offset, length);
>>> -             kunmap_atomic(vaddr);
>>> +     if (ret) {
>>> +             vaddr = kmap(page);
>>> +             ret = __copy_to_user(user_data, vaddr + offset, len);
>>> +             kunmap(page);
>>>        }
>>> -     if (ret == 0)
>>> -             return 0;
>>>    
>>> -     return shmem_pread_slow(page, offset, length, user_data,
>>> -                             page_do_bit17_swizzling, needs_clflush);
>>> +     return ret ? - EFAULT : 0;
>>
>> Why the space between - and EFAULT? Or why not just return ret?
> 
> Fat fingers.
> 
>>>    }
>>>    
>>>    static int
>>> @@ -1104,15 +1009,10 @@ i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
>>>    {
>>>        char __user *user_data;
>>>        u64 remain;
>>> -     unsigned int obj_do_bit17_swizzling;
>>>        unsigned int needs_clflush;
>>>        unsigned int idx, offset;
>>>        int ret;
>>>    
>>> -     obj_do_bit17_swizzling = 0;
>>> -     if (i915_gem_object_needs_bit17_swizzle(obj))
>>> -             obj_do_bit17_swizzling = BIT(17);
>>> -
>>>        ret = mutex_lock_interruptible(&obj->base.dev->struct_mutex);
>>>        if (ret)
>>>                return ret;
>>> @@ -1134,7 +1034,6 @@ i915_gem_shmem_pread(struct drm_i915_gem_object *obj,
>>>                        length = PAGE_SIZE - offset;
>>
>> It's pre-existing, but there seems to be a potential overflow of int
>> length = u64 remain a few lines above this.
> 
> Hmm. If so, that's one that escaped my perusal. Time for sanity checks
> probably.

Feel free to leave it for a separate patch/series; it was just an 
observation. Or even see if you can delegate it. :)
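
If you do get to it, something along these lines would avoid the silent 
narrowing (purely illustrative, untested):

	u64 remain;
	unsigned int offset, length;
	...
	/* Clamp in u64 space before the value ever lands in an int, so a
	 * multi-GiB 'remain' cannot truncate the per-page length. */
	length = min_t(u64, remain, PAGE_SIZE - offset);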

Regards,

Tvrtko

