[Intel-gfx] [PATCH] drm/i915: Ratelimit i915_globals_park
Tvrtko Ursulin
tvrtko.ursulin at linux.intel.com
Wed Dec 18 12:49:26 UTC 2019
On 18/12/2019 09:40, Chris Wilson wrote:
> When doing our global park, we like to be a good citizen and shrink our
> slab caches (of which we have quite a few now), but each
> kmem_cache_shrink() incurs a stop_machine() and so ends up being quite
> expensive, causing machine-wide stalls. While ideally we would like to
> throw away unused pages in our slab caches whenever it appears that we
> are idling, doing so will require a much cheaper mechanism. In the
> meantime use a delayed work to impose a rate-limit that means we have
> to have been idle for more than 2 seconds before we start shrinking.
I was thinking about this yesterday, while looking at rapid park-unpark
cycles!
> References: https://gitlab.freedesktop.org/drm/intel/issues/848
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin at intel.com>
> ---
> drivers/gpu/drm/i915/i915_globals.c | 53 ++++++++++++++++++++++++-----
> 1 file changed, 44 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_globals.c b/drivers/gpu/drm/i915/i915_globals.c
> index be127cd28931..3aa213684293 100644
> --- a/drivers/gpu/drm/i915/i915_globals.c
> +++ b/drivers/gpu/drm/i915/i915_globals.c
> @@ -20,7 +20,10 @@ static LIST_HEAD(globals);
> static atomic_t active;
> static atomic_t epoch;
> static struct park_work {
> - struct rcu_work work;
> + struct delayed_work work;
> + struct rcu_head rcu;
> + unsigned long flags;
> +#define PENDING 0
> int epoch;
> } park;
>
> @@ -37,11 +40,33 @@ static void i915_globals_shrink(void)
> global->shrink();
> }
>
> +static void __i915_globals_grace(struct rcu_head *rcu)
> +{
> + /* Ratelimit parking as shrinking is quite slow */
> + schedule_delayed_work(&park.work, round_jiffies_up_relative(2 * HZ));
> +}
> +
> +static void __i915_globals_queue_rcu(void)
> +{
> + park.epoch = atomic_inc_return(&epoch);
> + if (!atomic_read(&active)) {
> + init_rcu_head(&park.rcu);
Do we need to do init/destroy more than once? I think once on driver
load/exit would be more correct since the head is statically allocated.
Otherwise looks correct to me.
Regards,
Tvrtko
> + call_rcu(&park.rcu, __i915_globals_grace);
> + }
> +}
> +
> static void __i915_globals_park(struct work_struct *work)
> {
> + destroy_rcu_head(&park.rcu);
> +
> /* Confirm nothing woke up in the last grace period */
> - if (park.epoch == atomic_read(&epoch))
> - i915_globals_shrink();
> + if (park.epoch != atomic_read(&epoch)) {
> + __i915_globals_queue_rcu();
> + return;
> + }
> +
> + clear_bit(PENDING, &park.flags);
> + i915_globals_shrink();
> }
>
> void __init i915_global_register(struct i915_global *global)
> @@ -85,7 +110,7 @@ int __init i915_globals_init(void)
> }
> }
>
> - INIT_RCU_WORK(&park.work, __i915_globals_park);
> + INIT_DELAYED_WORK(&park.work, __i915_globals_park);
> return 0;
> }
>
> @@ -103,8 +128,9 @@ void i915_globals_park(void)
> if (!atomic_dec_and_test(&active))
> return;
>
> - park.epoch = atomic_inc_return(&epoch);
> - queue_rcu_work(system_wq, &park.work);
> + /* Queue cleanup after the next RCU grace period has freed slabs */
> + if (!test_and_set_bit(PENDING, &park.flags))
> + __i915_globals_queue_rcu();
> }
>
> void i915_globals_unpark(void)
> @@ -113,12 +139,21 @@ void i915_globals_unpark(void)
> atomic_inc(&active);
> }
>
> +static void __exit __i915_globals_flush(void)
> +{
> + atomic_inc(&active); /* skip shrinking */
> +
> + rcu_barrier(); /* wait for the work to be queued */
> + flush_delayed_work(&park.work);
> +
> + atomic_dec(&active);
> +}
> +
> void __exit i915_globals_exit(void)
> {
> - /* Flush any residual park_work */
> - atomic_inc(&epoch);
> - flush_rcu_work(&park.work);
> + GEM_BUG_ON(atomic_read(&active));
>
> + __i915_globals_flush();
> __i915_globals_cleanup();
>
> /* And ensure that our DESTROY_BY_RCU slabs are truly destroyed */
>
More information about the Intel-gfx
mailing list