[Intel-gfx] [PATCH 23/34] drm/i915: Share per-timeline HWSP using a slab suballocator

Tue Jan 22 11:33:03 UTC 2019

On 22/01/2019 11:12, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-01-22 10:47:11)
>>
>> On 21/01/2019 22:21, Chris Wilson wrote:
>>> If we restrict ourselves to only using a cacheline for each timeline's
>>> HWSP (we could go smaller, but want to avoid needless polluting
>>> cachelines on different engines between different contexts), then we can
>>> suballocate a single 4k page into 64 different timeline HWSP. By
>>> treating each fresh allocation as a slab of 64 entries, we can keep it
>>> around for the next 64 allocation attempts until we need to refresh the
>>> slab cache.
>>>
>>> John Harrison noted the issue of fragmentation leading to the same worst
>>> case performance of one page per timeline as before, which can be
>>> mitigated by adopting a freelist.
>>>
>>> v2: Keep all partially allocated HWSP on a freelist
>>>
>>> This is still without migration, so it is possible for the system to end
>>> up with each timeline in its own page, but we ensure that no new
>>> allocation would needless allocate a fresh page!
>>>
>>> v3: Throw a selftest at the allocator to try and catch invalid cacheline
>>> reuse.
>>>
>>> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
>>> Cc: John Harrison <John.C.Harrison at Intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/i915_drv.h               |   4 +
>>>    drivers/gpu/drm/i915/i915_timeline.c          | 117 ++++++++++++---
>>>    drivers/gpu/drm/i915/i915_timeline.h          |   1 +
>>>    drivers/gpu/drm/i915/i915_vma.h               |  12 ++
>>>    drivers/gpu/drm/i915/selftests/i915_random.c  |  33 ++++-
>>>    drivers/gpu/drm/i915/selftests/i915_random.h  |   3 +
>>>    .../gpu/drm/i915/selftests/i915_timeline.c    | 140 ++++++++++++++++++
>>>    7 files changed, 282 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>>> index 364067f811f7..c00eaf2889fb 100644
>>> --- a/drivers/gpu/drm/i915/i915_drv.h
>>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>>> @@ -1978,6 +1978,10 @@ struct drm_i915_private {
>>>                struct i915_gt_timelines {
>>>                        struct mutex mutex; /* protects list, tainted by GPU */
>>>                        struct list_head list;
>>> +
>>> +                     /* Pack multiple timelines' seqnos into the same page */
>>> +                     spinlock_t hwsp_lock;
>>> +                     struct list_head hwsp_free_list;
>>>                } timelines;
>>>    
>>>                struct list_head active_rings;
>>> diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c
>>> index 8d5792311a8f..69ee33dfa340 100644
>>> --- a/drivers/gpu/drm/i915/i915_timeline.c
>>> +++ b/drivers/gpu/drm/i915/i915_timeline.c
>>> @@ -9,6 +9,12 @@
>>>    #include "i915_timeline.h"
>>>    #include "i915_syncmap.h"
>>>    
>>> +struct i915_timeline_hwsp {
>>> +     struct i915_vma *vma;
>>> +     struct list_head free_link;
>>> +     u64 free_bitmap;
>>> +};
>>> +
>>>    static struct i915_vma *__hwsp_alloc(struct drm_i915_private *i915)
>>>    {
>>>        struct drm_i915_gem_object *obj;
>>> @@ -27,28 +33,92 @@ static struct i915_vma *__hwsp_alloc(struct drm_i915_private *i915)
>>>        return vma;
>>>    }
>>>    
>>> -static int hwsp_alloc(struct i915_timeline *timeline)
>>> +static struct i915_vma *
>>> +hwsp_alloc(struct i915_timeline *timeline, int *offset)
>>>    {
>>> -     struct i915_vma *vma;
>>> +     struct drm_i915_private *i915 = timeline->i915;
>>> +     struct i915_gt_timelines *gt = &i915->gt.timelines;
>>> +     struct i915_timeline_hwsp *hwsp;
>>> +     int cacheline;
>>>    
>>> -     vma = __hwsp_alloc(timeline->i915);
>>> -     if (IS_ERR(vma))
>>> -             return PTR_ERR(vma);
>>> +     BUILD_BUG_ON(BITS_PER_TYPE(u64) * CACHELINE_BYTES > PAGE_SIZE);
>>>    
>>> -     timeline->hwsp_ggtt = vma;
>>> -     timeline->hwsp_offset = 0;
>>> +     spin_lock(&gt->hwsp_lock);
>>>    
>>> -     return 0;
>>> +     /* hwsp_free_list only contains HWSP that have available cachelines */
>>> +     hwsp = list_first_entry_or_null(&gt->hwsp_free_list,
>>> +                                     typeof(*hwsp), free_link);
>>> +     if (!hwsp) {
>>> +             struct i915_vma *vma;
>>> +
>>> +             spin_unlock(&gt->hwsp_lock);
>>> +
>>> +             hwsp = kmalloc(sizeof(*hwsp), GFP_KERNEL);
>>> +             if (!hwsp)
>>> +                     return ERR_PTR(-ENOMEM);
>>> +
>>> +             vma = __hwsp_alloc(i915);
>>> +             if (IS_ERR(vma)) {
>>> +                     kfree(hwsp);
>>> +                     return vma;
>>> +             }
>>> +
>>> +             vma->private = hwsp;
>>> +             hwsp->vma = vma;
>>> +             hwsp->free_bitmap = ~0ull;
>>> +
>>> +             spin_lock(&gt->hwsp_lock);
>>> +             list_add(&hwsp->free_link, &gt->hwsp_free_list);
>>> +     }
>>> +
>>> +     GEM_BUG_ON(!hwsp->free_bitmap);
>>> +     cacheline = __ffs64(hwsp->free_bitmap);
>>> +     hwsp->free_bitmap &= ~BIT_ULL(cacheline);
>>> +     if (!hwsp->free_bitmap)
>>> +             list_del(&hwsp->free_link);
>>> +
>>> +     spin_unlock(&gt->hwsp_lock);
>>> +
>>> +     GEM_BUG_ON(hwsp->vma->private != hwsp);
>>> +
>>> +     *offset = cacheline * CACHELINE_BYTES;
>>> +     return hwsp->vma;
>>> +}
>>> +
>>> +static void hwsp_free(struct i915_timeline *timeline)
>>> +{
>>> +     struct i915_gt_timelines *gt = &timeline->i915->gt.timelines;
>>> +     struct i915_timeline_hwsp *hwsp;
>>> +
>>> +     hwsp = i915_timeline_hwsp(timeline);
>>> +     if (!hwsp) /* leave global HWSP alone! */
>>
>> Later you add i915_timeline_is_global so could use it here.
> 
> Difference is that we want the hwsp for use in this function?
> 
> if (i915_timeline_is_global(timeline))
> 	return;
> 
> hwsp = i915_timeline_hwsp(timeline);
> 
> I suppose has the feeling of being more descriptive and the compiler
> should be smart enough to dtrt.

I missed that so I think it is best as it is. Existing comment is enough 
to make it clear.

>>
>>> +             return;
>>> +
>>> +     spin_lock(&gt->hwsp_lock);
>>> +
>>> +     /* As a cacheline becomes available, publish the HWSP on the freelist */
>>
>> Thank you! :)
>>
>>> +     if (!hwsp->free_bitmap)
>>> +             list_add_tail(&hwsp->free_link, &gt->hwsp_free_list);
>>> +
>>> +     hwsp->free_bitmap |= BIT_ULL(timeline->hwsp_offset / CACHELINE_BYTES);
>>> +
>>> +     /* And if no one is left using it, give the page back to the system */
>>> +     if (hwsp->free_bitmap == ~0ull) {
>>> +             i915_vma_put(hwsp->vma);
>>> +             list_del(&hwsp->free_link);
>>> +             kfree(hwsp);
>>> +     }
>>> +
>>> +     spin_unlock(&gt->hwsp_lock);
>>>    }
>>>    
>>>    int i915_timeline_init(struct drm_i915_private *i915,
>>>                       struct i915_timeline *timeline,
>>>                       const char *name,
>>> -                    struct i915_vma *global_hwsp)
>>> +                    struct i915_vma *hwsp)
>>>    {
>>>        struct i915_gt_timelines *gt = &i915->gt.timelines;
>>>        void *vaddr;
>>> -     int err;
>>>    
>>>        /*
>>>         * Ideally we want a set of engines on a single leaf as we expect
>>> @@ -64,18 +134,18 @@ int i915_timeline_init(struct drm_i915_private *i915,
>>>        timeline->name = name;
>>>        timeline->pin_count = 0;
>>>    
>>> -     if (global_hwsp) {
>>> -             timeline->hwsp_ggtt = i915_vma_get(global_hwsp);
>>> -             timeline->hwsp_offset = I915_GEM_HWS_SEQNO_ADDR;
>>> -     } else {
>>> -             err = hwsp_alloc(timeline);
>>> -             if (err)
>>> -                     return err;
>>> +     timeline->hwsp_offset = I915_GEM_HWS_SEQNO_ADDR;
>>
>> Could be clearer to put this on the else branch.
> 
> Hey! I rewrote this because I thought it was tidier without the else and
> repeated i915_vma_get() :)
> 
>>> -void i915_random_reorder(unsigned int *order, unsigned int count,
>>> -                      struct rnd_state *state)
>>> +void i915_prandom_shuffle(void *arr, size_t elsz, size_t count,
>>> +                       struct rnd_state *state)
>>>    {
>>> -     unsigned int i, j;
>>> +     char stack[128];
>>> +
>>> +     if (WARN_ON(elsz > sizeof(stack) || count > U32_MAX))
>>
>> I wonder if the elsz > sizeof(stack) would work as a BUILD_BUG_ON if
>> elsz was marked as const. Seems to be const at both call sites.
> 
> BUILD_BUG_ON is cpp-ish, so it needs to provably constant, so in the same
> translation unit.
> 
>> Step further sizing the stack by it, but.. that's some GCC extension we
>> should not use right?
> 
> Right. No char stack[elsz] for us. Alternatively is to make this into a
> macro-builder and have a custom shuffle for every type. For selftesting,
> just not worth it. (Until we have an example were we are severely
> limited in our testing by how long it takes to shuffle numbers... But we
> can probably just change any test to shuffle less.)
> 
>>> +static int __mock_hwsp_timeline(struct mock_hwsp_freelist *state,
>>> +                             unsigned int count,
>>> +                             unsigned int flags)
>>> +{
>>> +     struct i915_timeline *tl;
>>> +     unsigned int idx;
>>> +
>>> +     while (count--) {
>>> +             unsigned long cacheline;
>>> +             int err;
>>> +
>>> +             tl = i915_timeline_create(state->i915, "mock", NULL);
>>> +             if (IS_ERR(tl))
>>> +                     return PTR_ERR(tl);
>>> +
>>> +             cacheline = hwsp_cacheline(tl);
>>> +             err = radix_tree_insert(&state->cachelines, cacheline, tl);
>>> +             if (err) {
>>> +                     if (err == -EEXIST) {
>>> +                             pr_err("HWSP cacheline %lu already used; duplicate allocation!\n",
>>> +                                    cacheline);
>>> +                     }
>>
>> Radix tree is just to lookup potential offset duplicates? I mean, to
>> avoid doing a linear search on the history array? If so is it worth it
>> since there aren't that many timelines used in test? Could be since
>> below I figured the maximum number of timelines test will use...
> 
> Radix tree because that's where I started with the test: I want to
> detect duplicate cachelines.
> 
>>> +                     i915_timeline_put(tl);
>>> +                     return err;
>>> +             }
>>> +
>>> +             idx = state->count++ % state->max;
>>> +             __mock_hwsp_record(state, idx, tl);
>>
>> This doesn't hit "recycling" of slots right? Max count is 2 *
>> CACHELINES_PER_PAGE = 128 while state->max is 512.
> 
> The history is kept over the different phases. We add a block, remove a
> smaller block; repeat. My thinking was that would create a more
> interesting freelist pattern.

Yeah, I missed the state is kept. I think is is okay then.

>>> +     }
>>> +
>>> +     if (flags & SHUFFLE)
>>> +             i915_prandom_shuffle(state->history,
>>> +                                  sizeof(*state->history),
>>> +                                  min(state->count, state->max),
>>> +                                  &state->prng);
>>> +
>>> +     count = i915_prandom_u32_max_state(min(state->count, state->max),
>>> +                                        &state->prng);
>>> +     while (count--) {
>>> +             idx = --state->count % state->max;
>>> +             __mock_hwsp_record(state, idx, NULL);
>>> +     }
>>
>> There is no allocations after shuffling, so the code path of allocating
>> from hwsp free list will not be exercised I think.
> 
> We keep history from one iteration to the next. I think if we added a loop
> here (____mock_hwsp_timeline!) you would be happier overall.

Loop to allocate a random number of entries? Yeah that would make a pass 
more self contained. But optional..

> 
>>> +     return 0;
>>> +}
>>> +
>>> +static int mock_hwsp_freelist(void *arg)
>>> +{
>>> +     struct mock_hwsp_freelist state;
>>> +     const struct {
>>> +             const char *name;
>>> +             unsigned int flags;
>>> +     } phases[] = {
>>> +             { "linear", 0 },
>>> +             { "shuffled", SHUFFLE },
>>> +             { },
>>> +     }, *p;
>>> +     unsigned int na;
>>> +     int err = 0;
>>> +
>>> +     INIT_RADIX_TREE(&state.cachelines, GFP_KERNEL);
>>> +     state.prng = I915_RND_STATE_INITIALIZER(i915_selftest.random_seed);
>>> +
>>> +     state.i915 = mock_gem_device();
>>> +     if (!state.i915)
>>> +             return -ENOMEM;
>>> +
>>> +     /*
>>> +      * Create a bunch of timelines and check that their HWSP do not overlap.
>>> +      * Free some, and try again.
>>> +      */
>>> +
>>> +     state.max = PAGE_SIZE / sizeof(*state.history);
>>
>> So maximum live number of timelines is 512 on 64-bit which should
>> translate to 8 max pages of hwsp backing store.
> 
> Do you feel like we need more or less? I think that's a reasonable
> number, it means we should have some pages eventually on the freelist.
> And not so many that reuse is sporadic (although the allocator is geared
> towards keeping the same page for reuse until full).

I think it is fine, was just talking out loud as I was figuring out what 
the test is doing.

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>

Regards,

Tvrtko