[Intel-gfx] [PATCH 5/5] drm/i915: Implement an oom-notifier for last resort shrinking

Daniel Vetter daniel at ffwll.ch
Tue May 20 10:38:05 CEST 2014


On Tue, May 20, 2014 at 08:28:43AM +0100, Chris Wilson wrote:
> Before the process killer is invoked, oom-notifiers are executed for one
> last try at recovering pages. We can hook into this callback to be sure
> that everything that can be is purged from our page lists, and to give a
> summary of how much memory is still pinned by the GPU in the case of an
> oom.
"This should be really valuable for debugging OOM issues."

I think we should put more emphasis on the shrink_all change. Maybe:

"Note that the last-ditch effort call to shrink_all we've previously
called from our normal shrinker when we could free as much as the vm
demaned is moved into the oom notifier. Since the shrinker accounting
races against bind/unbind operations we might have called shrink_all
prematurely, which this approach with an oom notifier avoids."

And one bikeshed below.
-Daniel
> 
> References: https://bugs.freedesktop.org/show_bug.cgi?id=72742
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Tested-by: lu hua <huax.lu at intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.h |  1 +
>  drivers/gpu/drm/i915/i915_gem.c | 74 +++++++++++++++++++++++++++++++++++++++--
>  2 files changed, 72 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index e69cb51de738..389204d44431 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1057,6 +1057,7 @@ struct i915_gem_mm {
>  	/** PPGTT used for aliasing the PPGTT with the GTT */
>  	struct i915_hw_ppgtt *aliasing_ppgtt;
>  
> +	struct notifier_block oom_notifier;
>  	struct shrinker shrinker;
>  	bool shrinker_no_lock_stealing;
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index ea93898d51bc..dc8e1ef50bfb 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -31,6 +31,7 @@
>  #include "i915_drv.h"
>  #include "i915_trace.h"
>  #include "intel_drv.h"
> +#include <linux/oom.h>
>  #include <linux/shmem_fs.h>
>  #include <linux/slab.h>
>  #include <linux/swap.h>
> @@ -61,6 +62,9 @@ static unsigned long i915_gem_shrinker_count(struct shrinker *shrinker,
>  					     struct shrink_control *sc);
>  static unsigned long i915_gem_shrinker_scan(struct shrinker *shrinker,
>  					    struct shrink_control *sc);
> +static int i915_gem_shrinker_oom(struct notifier_block *nb,
> +				 unsigned long event,
> +				 void *ptr);
>  static unsigned long i915_gem_purge(struct drm_i915_private *dev_priv, long target);
>  static unsigned long i915_gem_shrink_all(struct drm_i915_private *dev_priv);
>  
> @@ -4759,6 +4763,9 @@ i915_gem_load(struct drm_device *dev)
>  	dev_priv->mm.shrinker.count_objects = i915_gem_shrinker_count;
>  	dev_priv->mm.shrinker.seeks = DEFAULT_SEEKS;
>  	register_shrinker(&dev_priv->mm.shrinker);
> +
> +	dev_priv->mm.oom_notifier.notifier_call = i915_gem_shrinker_oom;
> +	register_oom_notifier(&dev_priv->mm.oom_notifier);
>  }
>  
>  /*
> @@ -5154,15 +5161,76 @@ i915_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc)
>  		freed += __i915_gem_shrink(dev_priv,
>  					   sc->nr_to_scan - freed,
>  					   false);
> -	if (freed < sc->nr_to_scan)
> -		freed += i915_gem_shrink_all(dev_priv);
> -
>  	if (unlock)
>  		mutex_unlock(&dev->struct_mutex);
>  
>  	return freed;
>  }
>  
> +static int
> +i915_gem_shrinker_oom(struct notifier_block *nb, unsigned long event, void *ptr)
> +{
> +	struct drm_i915_private *dev_priv =
> +		container_of(nb, struct drm_i915_private, mm.oom_notifier);
> +	struct drm_device *dev = dev_priv->dev;
> +	struct drm_i915_gem_object *obj;
> +	unsigned long timeout = msecs_to_jiffies(5000) + 1;
> +	unsigned long pinned, bound, unbound, freed;
> +	bool was_interruptible;
> +	bool unlock;
> +
> +	while (!i915_gem_shrinker_lock(dev, &unlock) && --timeout)
> +		schedule_timeout_killable(1);
> +	if (timeout == 0) {
> +		pr_err("Unable to purge GPU memory due lock contention.\n");
> +		return NOTIFY_DONE;
> +	}
> +
> +	was_interruptible = dev_priv->mm.interruptible;
> +	dev_priv->mm.interruptible = false;
> +
> +	freed = i915_gem_shrink_all(dev_priv);
> +
> +	dev_priv->mm.interruptible = was_interruptible;
> +
> +	/* Because we may be allocating inside our own driver, we cannot
> +	 * assert that there are no objects with pinned pages that are not
> +	 * being pointed to by hardware.
> +	 */
> +	unbound = bound = pinned = 0;
> +	list_for_each_entry(obj, &dev_priv->mm.unbound_list, global_list) {
> +		if (!obj->base.filp) /* not backed by a freeable object */
> +			continue;
> +
> +		if (obj->pages_pin_count)
> +			pinned += obj->base.size;
> +		else
> +			unbound += obj->base.size;
> +	}
> +	list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list) {
> +		if (!obj->base.filp)
> +			continue;
> +
> +		if (obj->pages_pin_count)
> +			pinned += obj->base.size;
> +		else
> +			bound += obj->base.size;
> +	}
> +
> +	if (unlock)
> +		mutex_unlock(&dev->struct_mutex);
> +
> +	pr_info("Purging GPU memory, %lu bytes freed, %lu bytes still pinned.\n",
> +		freed, pinned);
> +	if (unbound | bound)

I prefer a boolean || here since we compare against non-zero for each and
not bitfields.

> +		pr_err("%lu and %lu bytes still available in the "
> +		       "bound and unbound GPU page lists.\n",
> +		       bound, unbound);
> +
> +	*(unsigned long *)ptr += freed;
> +	return NOTIFY_DONE;
> +}
> +
>  struct i915_vma *i915_gem_obj_to_ggtt(struct drm_i915_gem_object *obj)
>  {
>  	struct i915_vma *vma;
> -- 
> 2.0.0.rc2
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch



More information about the Intel-gfx mailing list