[Intel-gfx] [PATCH 2/2] drm/i915: Shrink the GEM kmem_caches upon idling

Wed Jan 24 11:06:36 UTC 2018

On 24/01/2018 10:37, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2018-01-24 10:32:16)
>>
>> On 19/01/2018 15:23, Chris Wilson wrote:
>>> When we finally decide the gpu is idle, that is a good time to shrink
>>> our kmem_caches.
>>>
>>> v3: Defer until an rcu grace period after we idle.
>>>
>>> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
>>> Cc: Tvrtko Ursulin <tvrtko.ursulin at linux.intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/i915_gem.c | 65 +++++++++++++++++++++++++++++++++++++++++
>>>    1 file changed, 65 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>>> index 7f0684ccc724..6a8fbcae835b 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>>> @@ -3341,12 +3341,59 @@ new_requests_since_last_retire(const struct drm_i915_private *i915)
>>>                work_pending(&i915->gt.idle_work.work));
>>>    }
>>>    
>>> +static void shrink_caches(struct drm_i915_private *i915)
>>> +{
>>> +     /*
>>> +      * kmem_cache_shrink() discards empty slabs and reorders partially
>>> +      * filled slabs to prioritise allocating from the mostly full slabs,
>>> +      * with the aim of reducing fragmentation.
>>> +      */
>>> +     kmem_cache_shrink(i915->priorities);
>>> +     kmem_cache_shrink(i915->dependencies);
>>> +     kmem_cache_shrink(i915->requests);
>>> +     kmem_cache_shrink(i915->luts);
>>> +     kmem_cache_shrink(i915->vmas);
>>> +     kmem_cache_shrink(i915->objects);
>>> +}
>>> +
>>> +struct sleep_rcu_work {
>>> +     struct drm_i915_private *i915;
>>> +     struct rcu_head rcu;
>>> +     struct work_struct work;
>>> +     u32 epoch;
>>> +};
>>> +
>>> +static void __sleep_work(struct work_struct *work)
>>> +{
>>> +     struct sleep_rcu_work *s = container_of(work, typeof(*s), work);
>>> +     struct drm_i915_private *i915 = s->i915;
>>> +     u32 epoch = s->epoch;
>>> +
>>> +     kfree(s);
>>> +     if (epoch == READ_ONCE(i915->gt.epoch))
>>> +             shrink_caches(i915);
>>> +}
>>> +
>>> +static void __sleep_rcu(struct rcu_head *rcu)
>>> +{
>>> +     struct sleep_rcu_work *s = container_of(rcu, typeof(*s), rcu);
>>> +     struct drm_i915_private *i915 = s->i915;
>>> +
>>> +     if (s->epoch == READ_ONCE(i915->gt.epoch)) {
>>> +             INIT_WORK(&s->work, __sleep_work);
>>> +             queue_work(i915->wq, &s->work);
>>> +     } else {
>>> +             kfree(s);
>>> +     }
>>> +}
>>> +
>>>    static void
>>>    i915_gem_idle_work_handler(struct work_struct *work)
>>>    {
>>>        struct drm_i915_private *dev_priv =
>>>                container_of(work, typeof(*dev_priv), gt.idle_work.work);
>>>        bool rearm_hangcheck;
>>> +     u32 epoch = 0;
>>>        ktime_t end;
>>>    
>>>        if (!READ_ONCE(dev_priv->gt.awake))
>>> @@ -3406,6 +3453,7 @@ i915_gem_idle_work_handler(struct work_struct *work)
>>>        GEM_BUG_ON(!dev_priv->gt.awake);
>>>        dev_priv->gt.awake = false;
>>>        rearm_hangcheck = false;
>>> +     epoch = dev_priv->gt.epoch;
>>>    
>>>        if (INTEL_GEN(dev_priv) >= 6)
>>>                gen6_rps_idle(dev_priv);
>>> @@ -3421,6 +3469,23 @@ i915_gem_idle_work_handler(struct work_struct *work)
>>>                GEM_BUG_ON(!dev_priv->gt.awake);
>>>                i915_queue_hangcheck(dev_priv);
>>>        }
>>> +
>>> +     /*
>>> +      * When we are idle, it is an opportune time to reap our caches.
>>> +      * However, we have many objects that utilise RCU and the ordered
>>> +      * i915->wq that this work is executing on. To try and flush any
>>> +      * pending frees now we are idle, we first wait for an RCU grace
>>> +      * period, and then queue a task (that will run last on the wq) to
>>> +      * shrink and re-optimize the caches.
>>> +      */
>>> +     if (epoch == READ_ONCE(dev_priv->gt.epoch)) {
>>
>> Theoretically this can be true on epoch wrap-around, when trylock
>> failed. It's one in four billion busy transitions but it could be just
>> worth handling it explicitly. Simplest probably to ensure gt.epoch is
>> never zero when incrementing?
> 
> I was working on the principle that a u32 wraparound takes at least
> 100ms * 2^32, or about 326 years. :)

Ah forgot there is a time delay.. okay then. :)

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin at intel.com>

To be kept if/when you change to unsigned int.

Regards,

Tvrtko