[Intel-gfx] [PATCH 23/34] drm/i915: Share per-timeline HWSP using a slab suballocator
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Tue Jan 22 11:33:03 UTC 2019
On 22/01/2019 11:12, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-01-22 10:47:11)
>>
>> On 21/01/2019 22:21, Chris Wilson wrote:
>>> If we restrict ourselves to only using a cacheline for each timeline's
>>> HWSP (we could go smaller, but want to avoid needless polluting
>>> cachelines on different engines between different contexts), then we can
>>> suballocate a single 4k page into 64 different timeline HWSP. By
>>> treating each fresh allocation as a slab of 64 entries, we can keep it
>>> around for the next 64 allocation attempts until we need to refresh the
>>> slab cache.
>>>
>>> John Harrison noted the issue of fragmentation leading to the same worst
>>> case performance of one page per timeline as before, which can be
>>> mitigated by adopting a freelist.
>>>
>>> v2: Keep all partially allocated HWSP on a freelist
>>>
>>> This is still without migration, so it is possible for the system to end
>>> up with each timeline in its own page, but we ensure that no new
>>> allocation would needless allocate a fresh page!
>>>
>>> v3: Throw a selftest at the allocator to try and catch invalid cacheline
>>> reuse.
>>>
>>> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
>>> Cc: John Harrison <John.C.Harrison at Intel.com>
>>> ---
>>> drivers/gpu/drm/i915/i915_drv.h | 4 +
>>> drivers/gpu/drm/i915/i915_timeline.c | 117 ++++++++++++---
>>> drivers/gpu/drm/i915/i915_timeline.h | 1 +
>>> drivers/gpu/drm/i915/i915_vma.h | 12 ++
>>> drivers/gpu/drm/i915/selftests/i915_random.c | 33 ++++-
>>> drivers/gpu/drm/i915/selftests/i915_random.h | 3 +
>>> .../gpu/drm/i915/selftests/i915_timeline.c | 140 ++++++++++++++++++
>>> 7 files changed, 282 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>>> index 364067f811f7..c00eaf2889fb 100644
>>> --- a/drivers/gpu/drm/i915/i915_drv.h
>>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>>> @@ -1978,6 +1978,10 @@ struct drm_i915_private {
>>> struct i915_gt_timelines {
>>> struct mutex mutex; /* protects list, tainted by GPU */
>>> struct list_head list;
>>> +
>>> + /* Pack multiple timelines' seqnos into the same page */
>>> + spinlock_t hwsp_lock;
>>> + struct list_head hwsp_free_list;
>>> } timelines;
>>>
>>> struct list_head active_rings;
>>> diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c
>>> index 8d5792311a8f..69ee33dfa340 100644
>>> --- a/drivers/gpu/drm/i915/i915_timeline.c
>>> +++ b/drivers/gpu/drm/i915/i915_timeline.c
>>> @@ -9,6 +9,12 @@
>>> #include "i915_timeline.h"
>>> #include "i915_syncmap.h"
>>>
>>> +struct i915_timeline_hwsp {
>>> + struct i915_vma *vma;
>>> + struct list_head free_link;
>>> + u64 free_bitmap;
>>> +};
>>> +
>>> static struct i915_vma *__hwsp_alloc(struct drm_i915_private *i915)
>>> {
>>> struct drm_i915_gem_object *obj;
>>> @@ -27,28 +33,92 @@ static struct i915_vma *__hwsp_alloc(struct drm_i915_private *i915)
>>> return vma;
>>> }
>>>
>>> -static int hwsp_alloc(struct i915_timeline *timeline)
>>> +static struct i915_vma *
>>> +hwsp_alloc(struct i915_timeline *timeline, int *offset)
>>> {
>>> - struct i915_vma *vma;
>>> + struct drm_i915_private *i915 = timeline->i915;
>>> + struct i915_gt_timelines *gt = &i915->gt.timelines;
>>> + struct i915_timeline_hwsp *hwsp;
>>> + int cacheline;
>>>
>>> - vma = __hwsp_alloc(timeline->i915);
>>> - if (IS_ERR(vma))
>>> - return PTR_ERR(vma);
>>> + BUILD_BUG_ON(BITS_PER_TYPE(u64) * CACHELINE_BYTES > PAGE_SIZE);
>>>
>>> - timeline->hwsp_ggtt = vma;
>>> - timeline->hwsp_offset = 0;
>>> + spin_lock(>->hwsp_lock);
>>>
>>> - return 0;
>>> + /* hwsp_free_list only contains HWSP that have available cachelines */
>>> + hwsp = list_first_entry_or_null(>->hwsp_free_list,
>>> + typeof(*hwsp), free_link);
>>> + if (!hwsp) {
>>> + struct i915_vma *vma;
>>> +
>>> + spin_unlock(>->hwsp_lock);
>>> +
>>> + hwsp = kmalloc(sizeof(*hwsp), GFP_KERNEL);
>>> + if (!hwsp)
>>> + return ERR_PTR(-ENOMEM);
>>> +
>>> + vma = __hwsp_alloc(i915);
>>> + if (IS_ERR(vma)) {
>>> + kfree(hwsp);
>>> + return vma;
>>> + }
>>> +
>>> + vma->private = hwsp;
>>> + hwsp->vma = vma;
>>> + hwsp->free_bitmap = ~0ull;
>>> +
>>> + spin_lock(>->hwsp_lock);
>>> + list_add(&hwsp->free_link, >->hwsp_free_list);
>>> + }
>>> +
>>> + GEM_BUG_ON(!hwsp->free_bitmap);
>>> + cacheline = __ffs64(hwsp->free_bitmap);
>>> + hwsp->free_bitmap &= ~BIT_ULL(cacheline);
>>> + if (!hwsp->free_bitmap)
>>> + list_del(&hwsp->free_link);
>>> +
>>> + spin_unlock(>->hwsp_lock);
>>> +
>>> + GEM_BUG_ON(hwsp->vma->private != hwsp);
>>> +
>>> + *offset = cacheline * CACHELINE_BYTES;
>>> + return hwsp->vma;
>>> +}
>>> +
>>> +static void hwsp_free(struct i915_timeline *timeline)
>>> +{
>>> + struct i915_gt_timelines *gt = &timeline->i915->gt.timelines;
>>> + struct i915_timeline_hwsp *hwsp;
>>> +
>>> + hwsp = i915_timeline_hwsp(timeline);
>>> + if (!hwsp) /* leave global HWSP alone! */
>>
>> Later you add i915_timeline_is_global so could use it here.
>
> Difference is that we want the hwsp for use in this function?
>
> if (i915_timeline_is_global(timeline))
> return;
>
> hwsp = i915_timeline_hwsp(timeline);
>
> I suppose has the feeling of being more descriptive and the compiler
> should be smart enough to dtrt.
I missed that so I think it is best as it is. Existing comment is enough
to make it clear.
>>
>>> + return;
>>> +
>>> + spin_lock(>->hwsp_lock);
>>> +
>>> + /* As a cacheline becomes available, publish the HWSP on the freelist */
>>
>> Thank you! :)
>>
>>> + if (!hwsp->free_bitmap)
>>> + list_add_tail(&hwsp->free_link, >->hwsp_free_list);
>>> +
>>> + hwsp->free_bitmap |= BIT_ULL(timeline->hwsp_offset / CACHELINE_BYTES);
>>> +
>>> + /* And if no one is left using it, give the page back to the system */
>>> + if (hwsp->free_bitmap == ~0ull) {
>>> + i915_vma_put(hwsp->vma);
>>> + list_del(&hwsp->free_link);
>>> + kfree(hwsp);
>>> + }
>>> +
>>> + spin_unlock(>->hwsp_lock);
>>> }
>>>
>>> int i915_timeline_init(struct drm_i915_private *i915,
>>> struct i915_timeline *timeline,
>>> const char *name,
>>> - struct i915_vma *global_hwsp)
>>> + struct i915_vma *hwsp)
>>> {
>>> struct i915_gt_timelines *gt = &i915->gt.timelines;
>>> void *vaddr;
>>> - int err;
>>>
>>> /*
>>> * Ideally we want a set of engines on a single leaf as we expect
>>> @@ -64,18 +134,18 @@ int i915_timeline_init(struct drm_i915_private *i915,
>>> timeline->name = name;
>>> timeline->pin_count = 0;
>>>
>>> - if (global_hwsp) {
>>> - timeline->hwsp_ggtt = i915_vma_get(global_hwsp);
>>> - timeline->hwsp_offset = I915_GEM_HWS_SEQNO_ADDR;
>>> - } else {
>>> - err = hwsp_alloc(timeline);
>>> - if (err)
>>> - return err;
>>> + timeline->hwsp_offset = I915_GEM_HWS_SEQNO_ADDR;
>>
>> Could be clearer to put this on the else branch.
>
> Hey! I rewrote this because I thought it was tidier without the else and
> repeated i915_vma_get() :)
>
>>> -void i915_random_reorder(unsigned int *order, unsigned int count,
>>> - struct rnd_state *state)
>>> +void i915_prandom_shuffle(void *arr, size_t elsz, size_t count,
>>> + struct rnd_state *state)
>>> {
>>> - unsigned int i, j;
>>> + char stack[128];
>>> +
>>> + if (WARN_ON(elsz > sizeof(stack) || count > U32_MAX))
>>
>> I wonder if the elsz > sizeof(stack) would work as a BUILD_BUG_ON if
>> elsz was marked as const. Seems to be const at both call sites.
>
> BUILD_BUG_ON is cpp-ish, so it needs to provably constant, so in the same
> translation unit.
>
>> Step further sizing the stack by it, but.. that's some GCC extension we
>> should not use right?
>
> Right. No char stack[elsz] for us. Alternatively is to make this into a
> macro-builder and have a custom shuffle for every type. For selftesting,
> just not worth it. (Until we have an example were we are severely
> limited in our testing by how long it takes to shuffle numbers... But we
> can probably just change any test to shuffle less.)
>
>>> +static int __mock_hwsp_timeline(struct mock_hwsp_freelist *state,
>>> + unsigned int count,
>>> + unsigned int flags)
>>> +{
>>> + struct i915_timeline *tl;
>>> + unsigned int idx;
>>> +
>>> + while (count--) {
>>> + unsigned long cacheline;
>>> + int err;
>>> +
>>> + tl = i915_timeline_create(state->i915, "mock", NULL);
>>> + if (IS_ERR(tl))
>>> + return PTR_ERR(tl);
>>> +
>>> + cacheline = hwsp_cacheline(tl);
>>> + err = radix_tree_insert(&state->cachelines, cacheline, tl);
>>> + if (err) {
>>> + if (err == -EEXIST) {
>>> + pr_err("HWSP cacheline %lu already used; duplicate allocation!\n",
>>> + cacheline);
>>> + }
>>
>> Radix tree is just to lookup potential offset duplicates? I mean, to
>> avoid doing a linear search on the history array? If so is it worth it
>> since there aren't that many timelines used in test? Could be since
>> below I figured the maximum number of timelines test will use...
>
> Radix tree because that's where I started with the test: I want to
> detect duplicate cachelines.
>
>>> + i915_timeline_put(tl);
>>> + return err;
>>> + }
>>> +
>>> + idx = state->count++ % state->max;
>>> + __mock_hwsp_record(state, idx, tl);
>>
>> This doesn't hit "recycling" of slots right? Max count is 2 *
>> CACHELINES_PER_PAGE = 128 while state->max is 512.
>
> The history is kept over the different phases. We add a block, remove a
> smaller block; repeat. My thinking was that would create a more
> interesting freelist pattern.
Yeah, I missed the state is kept. I think is is okay then.
>>> + }
>>> +
>>> + if (flags & SHUFFLE)
>>> + i915_prandom_shuffle(state->history,
>>> + sizeof(*state->history),
>>> + min(state->count, state->max),
>>> + &state->prng);
>>> +
>>> + count = i915_prandom_u32_max_state(min(state->count, state->max),
>>> + &state->prng);
>>> + while (count--) {
>>> + idx = --state->count % state->max;
>>> + __mock_hwsp_record(state, idx, NULL);
>>> + }
>>
>> There is no allocations after shuffling, so the code path of allocating
>> from hwsp free list will not be exercised I think.
>
> We keep history from one iteration to the next. I think if we added a loop
> here (____mock_hwsp_timeline!) you would be happier overall.
Loop to allocate a random number of entries? Yeah that would make a pass
more self contained. But optional..
>
>>> + return 0;
>>> +}
>>> +
>>> +static int mock_hwsp_freelist(void *arg)
>>> +{
>>> + struct mock_hwsp_freelist state;
>>> + const struct {
>>> + const char *name;
>>> + unsigned int flags;
>>> + } phases[] = {
>>> + { "linear", 0 },
>>> + { "shuffled", SHUFFLE },
>>> + { },
>>> + }, *p;
>>> + unsigned int na;
>>> + int err = 0;
>>> +
>>> + INIT_RADIX_TREE(&state.cachelines, GFP_KERNEL);
>>> + state.prng = I915_RND_STATE_INITIALIZER(i915_selftest.random_seed);
>>> +
>>> + state.i915 = mock_gem_device();
>>> + if (!state.i915)
>>> + return -ENOMEM;
>>> +
>>> + /*
>>> + * Create a bunch of timelines and check that their HWSP do not overlap.
>>> + * Free some, and try again.
>>> + */
>>> +
>>> + state.max = PAGE_SIZE / sizeof(*state.history);
>>
>> So maximum live number of timelines is 512 on 64-bit which should
>> translate to 8 max pages of hwsp backing store.
>
> Do you feel like we need more or less? I think that's a reasonable
> number, it means we should have some pages eventually on the freelist.
> And not so many that reuse is sporadic (although the allocator is geared
> towards keeping the same page for reuse until full).
I think it is fine, was just talking out loud as I was figuring out what
the test is doing.
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
Regards,
Tvrtko
More information about the Intel-gfx
mailing list