[Intel-gfx] [PATCH 5/5] drm/i915: Implement an oom-notifier for last resort shrinking

Tue May 20 10:25:13 CEST 2014

On Tue, May 20, 2014 at 08:28:43AM +0100, Chris Wilson wrote:
> Before the process killer is invoked, oom-notifiers are executed for one
> last try at recovering pages. We can hook into this callback to be sure
> that everything that can be is purged from our page lists, and to give a
> summary of how much memory is still pinned by the GPU in the case of an
> oom.
> 
> References: https://bugs.freedesktop.org/show_bug.cgi?id=72742
> Signed-off-by: Chris Wilson <chris at chris-wilson.co.uk>
> Tested-by: lu hua <huax.lu at intel.com>

Dave, can you please have a look at this and ack it from a core mm
perspective? I just like your ack since you've worked together with Chris
on these issues.

Thanks, Daniel

> ---
>  drivers/gpu/drm/i915/i915_drv.h |  1 +
>  drivers/gpu/drm/i915/i915_gem.c | 74 +++++++++++++++++++++++++++++++++++++++--
>  2 files changed, 72 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index e69cb51de738..389204d44431 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1057,6 +1057,7 @@ struct i915_gem_mm {
>  	/** PPGTT used for aliasing the PPGTT with the GTT */
>  	struct i915_hw_ppgtt *aliasing_ppgtt;
>  
> +	struct notifier_block oom_notifier;
>  	struct shrinker shrinker;
>  	bool shrinker_no_lock_stealing;
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index ea93898d51bc..dc8e1ef50bfb 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -31,6 +31,7 @@
>  #include "i915_drv.h"
>  #include "i915_trace.h"
>  #include "intel_drv.h"
> +#include <linux/oom.h>
>  #include <linux/shmem_fs.h>
>  #include <linux/slab.h>
>  #include <linux/swap.h>
> @@ -61,6 +62,9 @@ static unsigned long i915_gem_shrinker_count(struct shrinker *shrinker,
>  					     struct shrink_control *sc);
>  static unsigned long i915_gem_shrinker_scan(struct shrinker *shrinker,
>  					    struct shrink_control *sc);
> +static int i915_gem_shrinker_oom(struct notifier_block *nb,
> +				 unsigned long event,
> +				 void *ptr);
>  static unsigned long i915_gem_purge(struct drm_i915_private *dev_priv, long target);
>  static unsigned long i915_gem_shrink_all(struct drm_i915_private *dev_priv);
>  
> @@ -4759,6 +4763,9 @@ i915_gem_load(struct drm_device *dev)
>  	dev_priv->mm.shrinker.count_objects = i915_gem_shrinker_count;
>  	dev_priv->mm.shrinker.seeks = DEFAULT_SEEKS;
>  	register_shrinker(&dev_priv->mm.shrinker);
> +
> +	dev_priv->mm.oom_notifier.notifier_call = i915_gem_shrinker_oom;
> +	register_oom_notifier(&dev_priv->mm.oom_notifier);
>  }
>  
>  /*
> @@ -5154,15 +5161,76 @@ i915_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc)
>  		freed += __i915_gem_shrink(dev_priv,
>  					   sc->nr_to_scan - freed,
>  					   false);
> -	if (freed < sc->nr_to_scan)
> -		freed += i915_gem_shrink_all(dev_priv);
> -
>  	if (unlock)
>  		mutex_unlock(&dev->struct_mutex);
>  
>  	return freed;
>  }
>  
> +static int
> +i915_gem_shrinker_oom(struct notifier_block *nb, unsigned long event, void *ptr)
> +{
> +	struct drm_i915_private *dev_priv =
> +		container_of(nb, struct drm_i915_private, mm.oom_notifier);
> +	struct drm_device *dev = dev_priv->dev;
> +	struct drm_i915_gem_object *obj;
> +	unsigned long timeout = msecs_to_jiffies(5000) + 1;
> +	unsigned long pinned, bound, unbound, freed;
> +	bool was_interruptible;
> +	bool unlock;
> +
> +	while (!i915_gem_shrinker_lock(dev, &unlock) && --timeout)
> +		schedule_timeout_killable(1);
> +	if (timeout == 0) {
> +		pr_err("Unable to purge GPU memory due lock contention.\n");
> +		return NOTIFY_DONE;
> +	}
> +
> +	was_interruptible = dev_priv->mm.interruptible;
> +	dev_priv->mm.interruptible = false;
> +
> +	freed = i915_gem_shrink_all(dev_priv);
> +
> +	dev_priv->mm.interruptible = was_interruptible;
> +
> +	/* Because we may be allocating inside our own driver, we cannot
> +	 * assert that there are no objects with pinned pages that are not
> +	 * being pointed to by hardware.
> +	 */
> +	unbound = bound = pinned = 0;
> +	list_for_each_entry(obj, &dev_priv->mm.unbound_list, global_list) {
> +		if (!obj->base.filp) /* not backed by a freeable object */
> +			continue;
> +
> +		if (obj->pages_pin_count)
> +			pinned += obj->base.size;
> +		else
> +			unbound += obj->base.size;
> +	}
> +	list_for_each_entry(obj, &dev_priv->mm.bound_list, global_list) {
> +		if (!obj->base.filp)
> +			continue;
> +
> +		if (obj->pages_pin_count)
> +			pinned += obj->base.size;
> +		else
> +			bound += obj->base.size;
> +	}
> +
> +	if (unlock)
> +		mutex_unlock(&dev->struct_mutex);
> +
> +	pr_info("Purging GPU memory, %lu bytes freed, %lu bytes still pinned.\n",
> +		freed, pinned);
> +	if (unbound | bound)
> +		pr_err("%lu and %lu bytes still available in the "
> +		       "bound and unbound GPU page lists.\n",
> +		       bound, unbound);
> +
> +	*(unsigned long *)ptr += freed;
> +	return NOTIFY_DONE;
> +}
> +
>  struct i915_vma *i915_gem_obj_to_ggtt(struct drm_i915_gem_object *obj)
>  {
>  	struct i915_vma *vma;
> -- 
> 2.0.0.rc2
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch