[PATCH 15/87] drm/i915: Pack the partial view size and offset into a single u64

Tvrtko Ursulin tvrtko.ursulin at linux.intel.com
Fri Jan 6 15:27:12 UTC 2017


On 06/01/2017 14:21, Chris Wilson wrote:
> On Fri, Jan 06, 2017 at 01:50:27PM +0000, Tvrtko Ursulin wrote:
>>
>> On 06/01/2017 13:38, Chris Wilson wrote:
>>> On Fri, Jan 06, 2017 at 01:32:32PM +0000, Tvrtko Ursulin wrote:
>>>>
>>>> On 05/01/2017 23:19, Chris Wilson wrote:
>>>>> Since the partial offset must be page aligned, we can use those low 12
>>>>> bits to encode the side of the partial view (which then cannot be larger
>>>>
>>>> s/side/size/
>>>>
>>>>> than 8MiB in pages).
>>>>>
>>>>> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
>>>>> ---
>>>>> drivers/gpu/drm/i915/i915_gem.c     | 22 +++++++++++++---------
>>>>> drivers/gpu/drm/i915/i915_gem_gtt.c |  4 ++--
>>>>> drivers/gpu/drm/i915/i915_gem_gtt.h | 25 +++++++++++++++++++++++--
>>>>> drivers/gpu/drm/i915/i915_vma.c     |  9 ++++-----
>>>>> 4 files changed, 42 insertions(+), 18 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>>>>> index f6f4ec894a7f..2c1631190a60 100644
>>>>> --- a/drivers/gpu/drm/i915/i915_gem.c
>>>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>>>>> @@ -1698,6 +1698,9 @@ static unsigned int tile_row_pages(struct drm_i915_gem_object *obj)
>>>>> {
>>>>> 	u64 size;
>>>>>
>>>>> +	BUILD_BUG_ON(ilog2(GEN7_FENCE_MAX_PITCH_VAL*128*32 >> PAGE_SHIFT) >
>>>>> +		     INTEL_PARTIAL_SIZE_BITS);
>>>>> +
>>>>> 	size = i915_gem_object_get_stride(obj);
>>>>> 	size *= i915_gem_object_get_tiling(obj) == I915_TILING_Y ? 32 : 8;
>>>>>
>>>>> @@ -1831,24 +1834,25 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf)
>>>>> 	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, flags);
>>>>> 	if (IS_ERR(vma)) {
>>>>> 		struct i915_ggtt_view view;
>>>>> -		unsigned int chunk_size;
>>>>> +		unsigned int chunk;
>>>>>
>>>>> 		/* Use a partial view if it is bigger than available space */
>>>>> -		chunk_size = MIN_CHUNK_PAGES;
>>>>> +		chunk = MIN_CHUNK_PAGES;
>>>>> 		if (i915_gem_object_is_tiled(obj))
>>>>> -			chunk_size = roundup(chunk_size, tile_row_pages(obj));
>>>>> +			chunk = roundup(chunk, tile_row_pages(obj));
>>>>>
>>>>> 		memset(&view, 0, sizeof(view));
>>>>> 		view.type = I915_GGTT_VIEW_PARTIAL;
>>>>> -		view.params.partial.offset = rounddown(page_offset, chunk_size);
>>>>> -		view.params.partial.size =
>>>>> -			min_t(unsigned int, chunk_size,
>>>>> -			      vma_pages(area) - view.params.partial.offset);
>>>>> +		view.params.partial.offset_size = rounddown(page_offset, chunk);
>>>>> +		view.params.partial.offset_size =
>>>>> +			(view.params.partial.offset_size << INTEL_PARTIAL_SIZE_BITS) |
>>>>> +			(min_t(unsigned int, chunk,
>>>>> +			      vma_pages(area) - view.params.partial.offset_size) - 1);
>>>>>
>>>>> 		/* If the partial covers the entire object, just create a
>>>>> 		 * normal VMA.
>>>>> 		 */
>>>>> -		if (chunk_size >= obj->base.size >> PAGE_SHIFT)
>>>>> +		if (chunk >= obj->base.size >> PAGE_SHIFT)
>>>>> 			view.type = I915_GGTT_VIEW_NORMAL;
>>>>>
>>>>> 		/* Userspace is now writing through an untracked VMA, abandon
>>>>> @@ -1878,7 +1882,7 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf)
>>>>>
>>>>> 	/* Finally, remap it using the new GTT offset */
>>>>> 	ret = remap_io_mapping(area,
>>>>> -			       area->vm_start + (vma->ggtt_view.params.partial.offset << PAGE_SHIFT),
>>>>> +			       area->vm_start + intel_partial_get_offset(&vma->ggtt_view.params.partial),
>>>>> 			       (ggtt->mappable_base + vma->node.start) >> PAGE_SHIFT,
>>>>> 			       min_t(u64, vma->size, area->vm_end - area->vm_start),
>>>>> 			       &ggtt->mappable);
>>>>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>> index f698006fe883..4e77baf7d652 100644
>>>>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
>>>>> @@ -3492,7 +3492,7 @@ intel_partial_pages(const struct i915_ggtt_view *view,
>>>>> {
>>>>> 	struct sg_table *st;
>>>>> 	struct scatterlist *sg, *iter;
>>>>> -	unsigned int count = view->params.partial.size;
>>>>> +	unsigned int count = intel_partial_get_pages(&view->params.partial);
>>>>
>>>> Thinking if get_pages could be improved since we use it elsewhere so
>>>> much - get_size_pages ? Or just the existing get_size() >>
>>>> PAGE_SHIFT.
>>>
>>> _get_page_count() or get_npages().
>>
>> First then, but I am not sure we need both get_size and get_page_count.
>>
>>>>> 	unsigned int offset;
>>>>> 	int ret = -ENOMEM;
>>>>>
>>>>> @@ -3505,7 +3505,7 @@ intel_partial_pages(const struct i915_ggtt_view *view,
>>>>> 		goto err_sg_alloc;
>>>>>
>>>>> 	iter = i915_gem_object_get_sg(obj,
>>>>> -				      view->params.partial.offset,
>>>>> +				      intel_partial_get_page_offset(&view->params.partial),
>>>>> 				      &offset);
>>>>> 	GEM_BUG_ON(!iter);
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
>>>>> index 19ea4c942df4..023bf6ac3dc7 100644
>>>>> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
>>>>> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
>>>>> @@ -156,10 +156,31 @@ struct intel_rotation_info {
>>>>> };
>>>>>
>>>>> struct intel_partial_info {
>>>>> -	u64 offset;
>>>>> -	unsigned int size;
>>>>> +	/* offset is page-aligned, leaving just enough bits for the size */
>>>>> +#define INTEL_PARTIAL_SIZE_BITS PAGE_SHIFT
>>>>> +	u64 offset_size;
>>>>> };
>>>>>
>>>>> +static inline u32 intel_partial_get_pages(const struct intel_partial_info *pi)
>>>>> +{
>>>>> +	return 1 + (pi->offset_size & GENMASK(INTEL_PARTIAL_SIZE_BITS-1, 0));
>>>>> +}
>>>>> +
>>>>> +static inline u32 intel_partial_get_size(const struct intel_partial_info *pi)
>>>>> +{
>>>>> +	return intel_partial_get_pages(pi) << PAGE_SHIFT;
>>>>> +}
>>>>> +
>>>>> +static inline u64 intel_partial_get_offset(const struct intel_partial_info *pi)
>>>>> +{
>>>>> +	return pi->offset_size & GENMASK(63, INTEL_PARTIAL_SIZE_BITS);
>>>>> +}
>>>>> +
>>>>> +static inline u64 intel_partial_get_page_offset(const struct intel_partial_info *pi)
>>>>> +{
>>>>> +	return pi->offset_size >> INTEL_PARTIAL_SIZE_BITS;
>>>>> +}
>>>>> +
>>>>> struct i915_ggtt_view {
>>>>> 	enum i915_ggtt_view_type type;
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
>>>>> index 58f2483362ad..65770b7109c0 100644
>>>>> --- a/drivers/gpu/drm/i915/i915_vma.c
>>>>> +++ b/drivers/gpu/drm/i915/i915_vma.c
>>>>> @@ -96,11 +96,10 @@ __i915_vma_create(struct drm_i915_gem_object *obj,
>>>>> 		vma->ggtt_view = *view;
>>>>> 		if (view->type == I915_GGTT_VIEW_PARTIAL) {
>>>>> 			GEM_BUG_ON(range_overflows_t(u64,
>>>>> -						     view->params.partial.offset,
>>>>> -						     view->params.partial.size,
>>>>> -						     obj->base.size >> PAGE_SHIFT));
>>>>> -			vma->size = view->params.partial.size;
>>>>> -			vma->size <<= PAGE_SHIFT;
>>>>> +						     intel_partial_get_offset(&view->params.partial),
>>>>> +						     intel_partial_get_size(&view->params.partial),
>>>>> +						     obj->base.size));
>>>>> +			vma->size = intel_partial_get_size(&view->params.partial);
>>>>> 			GEM_BUG_ON(vma->size >= obj->base.size);
>>>>> 		} else if (view->type == I915_GGTT_VIEW_ROTATED) {
>>>>> 			vma->size =
>>>>>
>>>>
>>>> I am not sure the complication is worth the small saving. Don't we
>>>> ever expect to start creating >8MiB partials? If one day we get an
>>>> efficient way of querying for the biggest hole we could start doing
>>>> that and then this would have to be undone.
>>>
>>> The current code works best with fixed size partials (fenceable chunks)
>>> as the issue with variable sized partials is the lookup cache efficiency.
>>
>> You mean the VMA rb tree? But fewer VMAs and fewer faults (larger
>> partials) surely must be better. 1MiB chunks was AFAIR just a
>> starting point.
>
> Depends. Bigger objects in the GGTT means more thrashing leaving PTEs
> unused between cycles. Remember we still do the try whole object first
> and only use partials if forced - which typically means we are in the
> contention scenario where objects are likely to be paged in and out of
> the GTT.

It depends on the usage pattern. Perhaps it is just one huge object 
which can't fit, and then we do it in 1 MiB chunks when we could use 
bigger ones instead. Maybe it doesn't matter for such huge objects; in 
fact maybe it doesn't hugely matter for mmap_gtt at all. But then the 
question is why bother saving a few bytes on the memcpy. And the ggtt 
view type == sizeof trickery is what brings the main saving anyway. 
(From a memcpy of 32 bytes down to 12 bytes; this patch only goes 
further, from 12 to 8, even though it comes before it in the series.)

> Improvements on the fault path should be primarily on shrinking the
> lock, ideally avoiding struct_mutex, adding a vm lock and fence lock
> (with more atomic pin counting),  to improve concurrency of faulters and
> avoid stalling others.

I agree with this as a priority, but it wouldn't necessarily preclude 
using the largest hole as the partial size.

>>> The packing is to ensure we have no unused bits in the memcmp() inside
>>> i915_vma_compare.
>>
>> Do you think it makes a difference? You were initially reluctant to
>> let that patch where I save a function call in the lookup in
>> straight away AFAIR.
>
> Because you were optimising the wrong thing. The primary user of
> vma_compare is for ggtt lookups, execbuf needs a O(1) path from handle
> to vma (which only used the vma rbtree for its initial creation).

Well, I saved a function call in a dominant path in the current code 
base. I don't see how that is the wrong thing to optimise.

> The fun with packing here is to not let i915_vma_compare() explode.
>
>> Btw, we are doing too much on trybot - it is bad for the history and
>> other people, well and ourselves actually. When digging for some
>> historical context we would need to search both lists now. I think
>> we need to try and limit this. Try to split the series in some
>> logical chunks and continue on the main list. Thoughts?
>
> The initial patches that we are discussing here are on the list, more
> recently https://patchwork.freedesktop.org/series/17337/
> Obviously that has now been jiggled about because of this discussion,
> but this set are unchanged.

Excuse me then for finding it difficult to navigate. You've been 
posting a lot to both lists and changing the composition of the series. 
I am trying to get to the latest version of the i915 selftests, which I 
started on before the break, but the road keeps stretching!

Regards,

Tvrtko
