From: Rob Clark robdclark@chromium.org
One would normally hope not to be under enough memory pressure to need to swap GEM objects to disk backed swap. But memory backed zram swap (as enabled on chromebooks, for example) can actually be quite fast and useful on devices with less RAM. On a 4GB device, opening up ~4 memory intensive web pages (in separate windows rather than tabs, to try and prevent tab discard), I see ~500MB worth of GEM objects, of which maybe only 10% are active at any time, and with unpin/evict enabled, only about half resident (which is a number that gets much lower if you simulate extreme memory pressure). Assuming a 2:1 compression ratio (I see a bit higher in practice, but cannot isolate swapped out GEM pages vs other), that is like having an extra 100+MB of RAM, or more under higher memory pressure.
Rob Clark (8): drm/msm: ratelimit GEM related WARN_ON()s drm/msm: Reorganize msm_gem_shrinker_scan() drm/msm: Clear msm_obj->sgt in put_pages() drm/msm: Split iova purge and close drm/msm: Add $debugfs/gem stats on resident objects drm/msm: Track potentially evictable objects drm/msm: Small msm_gem_purge() fix drm/msm: Support evicting GEM objects to swap
drivers/gpu/drm/msm/msm_drv.c | 2 +- drivers/gpu/drm/msm/msm_drv.h | 13 ++- drivers/gpu/drm/msm/msm_gem.c | 155 +++++++++++++++++-------- drivers/gpu/drm/msm/msm_gem.h | 68 +++++++++-- drivers/gpu/drm/msm/msm_gem_shrinker.c | 129 ++++++++++++-------- drivers/gpu/drm/msm/msm_gpu_trace.h | 13 +++ 6 files changed, 272 insertions(+), 108 deletions(-)
From: Rob Clark robdclark@chromium.org
If you mess something up, you don't really need to see the same WARN_ON() splat 4000 times, pumped out over a slow debug UART port.
Signed-off-by: Rob Clark robdclark@chromium.org --- drivers/gpu/drm/msm/msm_gem.c | 66 +++++++++++++++++------------------ drivers/gpu/drm/msm/msm_gem.h | 19 ++++++---- 2 files changed, 45 insertions(+), 40 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 4e91b095ab77..d5abe8aa9978 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -96,7 +96,7 @@ static struct page **get_pages(struct drm_gem_object *obj) { struct msm_gem_object *msm_obj = to_msm_bo(obj);
- WARN_ON(!msm_gem_is_locked(obj)); + GEM_WARN_ON(!msm_gem_is_locked(obj));
if (!msm_obj->pages) { struct drm_device *dev = obj->dev; @@ -180,7 +180,7 @@ struct page **msm_gem_get_pages(struct drm_gem_object *obj)
msm_gem_lock(obj);
- if (WARN_ON(msm_obj->madv != MSM_MADV_WILLNEED)) { + if (GEM_WARN_ON(msm_obj->madv != MSM_MADV_WILLNEED)) { msm_gem_unlock(obj); return ERR_PTR(-EBUSY); } @@ -256,7 +256,7 @@ static vm_fault_t msm_gem_fault(struct vm_fault *vmf) goto out; }
- if (WARN_ON(msm_obj->madv != MSM_MADV_WILLNEED)) { + if (GEM_WARN_ON(msm_obj->madv != MSM_MADV_WILLNEED)) { msm_gem_unlock(obj); return VM_FAULT_SIGBUS; } @@ -289,7 +289,7 @@ static uint64_t mmap_offset(struct drm_gem_object *obj) struct drm_device *dev = obj->dev; int ret;
- WARN_ON(!msm_gem_is_locked(obj)); + GEM_WARN_ON(!msm_gem_is_locked(obj));
/* Make it mmapable */ ret = drm_gem_create_mmap_offset(obj); @@ -318,7 +318,7 @@ static struct msm_gem_vma *add_vma(struct drm_gem_object *obj, struct msm_gem_object *msm_obj = to_msm_bo(obj); struct msm_gem_vma *vma;
- WARN_ON(!msm_gem_is_locked(obj)); + GEM_WARN_ON(!msm_gem_is_locked(obj));
vma = kzalloc(sizeof(*vma), GFP_KERNEL); if (!vma) @@ -337,7 +337,7 @@ static struct msm_gem_vma *lookup_vma(struct drm_gem_object *obj, struct msm_gem_object *msm_obj = to_msm_bo(obj); struct msm_gem_vma *vma;
- WARN_ON(!msm_gem_is_locked(obj)); + GEM_WARN_ON(!msm_gem_is_locked(obj));
list_for_each_entry(vma, &msm_obj->vmas, list) { if (vma->aspace == aspace) @@ -363,7 +363,7 @@ put_iova_spaces(struct drm_gem_object *obj) struct msm_gem_object *msm_obj = to_msm_bo(obj); struct msm_gem_vma *vma;
- WARN_ON(!msm_gem_is_locked(obj)); + GEM_WARN_ON(!msm_gem_is_locked(obj));
list_for_each_entry(vma, &msm_obj->vmas, list) { if (vma->aspace) { @@ -380,7 +380,7 @@ put_iova_vmas(struct drm_gem_object *obj) struct msm_gem_object *msm_obj = to_msm_bo(obj); struct msm_gem_vma *vma, *tmp;
- WARN_ON(!msm_gem_is_locked(obj)); + GEM_WARN_ON(!msm_gem_is_locked(obj));
list_for_each_entry_safe(vma, tmp, &msm_obj->vmas, list) { del_vma(vma); @@ -394,7 +394,7 @@ static int get_iova_locked(struct drm_gem_object *obj, struct msm_gem_vma *vma; int ret = 0;
- WARN_ON(!msm_gem_is_locked(obj)); + GEM_WARN_ON(!msm_gem_is_locked(obj));
vma = lookup_vma(obj, aspace);
@@ -429,13 +429,13 @@ static int msm_gem_pin_iova(struct drm_gem_object *obj, if (msm_obj->flags & MSM_BO_MAP_PRIV) prot |= IOMMU_PRIV;
- WARN_ON(!msm_gem_is_locked(obj)); + GEM_WARN_ON(!msm_gem_is_locked(obj));
- if (WARN_ON(msm_obj->madv != MSM_MADV_WILLNEED)) + if (GEM_WARN_ON(msm_obj->madv != MSM_MADV_WILLNEED)) return -EBUSY;
vma = lookup_vma(obj, aspace); - if (WARN_ON(!vma)) + if (GEM_WARN_ON(!vma)) return -EINVAL;
pages = get_pages(obj); @@ -453,7 +453,7 @@ static int get_and_pin_iova_range_locked(struct drm_gem_object *obj, u64 local; int ret;
- WARN_ON(!msm_gem_is_locked(obj)); + GEM_WARN_ON(!msm_gem_is_locked(obj));
ret = get_iova_locked(obj, aspace, &local, range_start, range_end); @@ -524,7 +524,7 @@ uint64_t msm_gem_iova(struct drm_gem_object *obj, msm_gem_lock(obj); vma = lookup_vma(obj, aspace); msm_gem_unlock(obj); - WARN_ON(!vma); + GEM_WARN_ON(!vma);
return vma ? vma->iova : 0; } @@ -537,11 +537,11 @@ void msm_gem_unpin_iova_locked(struct drm_gem_object *obj, { struct msm_gem_vma *vma;
- WARN_ON(!msm_gem_is_locked(obj)); + GEM_WARN_ON(!msm_gem_is_locked(obj));
vma = lookup_vma(obj, aspace);
- if (!WARN_ON(!vma)) + if (!GEM_WARN_ON(!vma)) msm_gem_unmap_vma(aspace, vma); }
@@ -593,12 +593,12 @@ static void *get_vaddr(struct drm_gem_object *obj, unsigned madv) struct msm_gem_object *msm_obj = to_msm_bo(obj); int ret = 0;
- WARN_ON(!msm_gem_is_locked(obj)); + GEM_WARN_ON(!msm_gem_is_locked(obj));
if (obj->import_attach) return ERR_PTR(-ENODEV);
- if (WARN_ON(msm_obj->madv > madv)) { + if (GEM_WARN_ON(msm_obj->madv > madv)) { DRM_DEV_ERROR(obj->dev->dev, "Invalid madv state: %u vs %u\n", msm_obj->madv, madv); return ERR_PTR(-EBUSY); @@ -664,8 +664,8 @@ void msm_gem_put_vaddr_locked(struct drm_gem_object *obj) { struct msm_gem_object *msm_obj = to_msm_bo(obj);
- WARN_ON(!msm_gem_is_locked(obj)); - WARN_ON(msm_obj->vmap_count < 1); + GEM_WARN_ON(!msm_gem_is_locked(obj)); + GEM_WARN_ON(msm_obj->vmap_count < 1);
msm_obj->vmap_count--; } @@ -707,8 +707,8 @@ void msm_gem_purge(struct drm_gem_object *obj) struct drm_device *dev = obj->dev; struct msm_gem_object *msm_obj = to_msm_bo(obj);
- WARN_ON(!is_purgeable(msm_obj)); - WARN_ON(obj->import_attach); + GEM_WARN_ON(!is_purgeable(msm_obj)); + GEM_WARN_ON(obj->import_attach);
put_iova_spaces(obj);
@@ -739,9 +739,9 @@ void msm_gem_vunmap(struct drm_gem_object *obj) { struct msm_gem_object *msm_obj = to_msm_bo(obj);
- WARN_ON(!msm_gem_is_locked(obj)); + GEM_WARN_ON(!msm_gem_is_locked(obj));
- if (!msm_obj->vaddr || WARN_ON(!is_vunmapable(msm_obj))) + if (!msm_obj->vaddr || GEM_WARN_ON(!is_vunmapable(msm_obj))) return;
vunmap(msm_obj->vaddr); @@ -789,9 +789,9 @@ void msm_gem_active_get(struct drm_gem_object *obj, struct msm_gpu *gpu) struct msm_drm_private *priv = obj->dev->dev_private;
might_sleep(); - WARN_ON(!msm_gem_is_locked(obj)); - WARN_ON(msm_obj->madv != MSM_MADV_WILLNEED); - WARN_ON(msm_obj->dontneed); + GEM_WARN_ON(!msm_gem_is_locked(obj)); + GEM_WARN_ON(msm_obj->madv != MSM_MADV_WILLNEED); + GEM_WARN_ON(msm_obj->dontneed);
if (msm_obj->active_count++ == 0) { mutex_lock(&priv->mm_lock); @@ -806,7 +806,7 @@ void msm_gem_active_put(struct drm_gem_object *obj) struct msm_gem_object *msm_obj = to_msm_bo(obj);
might_sleep(); - WARN_ON(!msm_gem_is_locked(obj)); + GEM_WARN_ON(!msm_gem_is_locked(obj));
if (--msm_obj->active_count == 0) { update_inactive(msm_obj); @@ -818,7 +818,7 @@ static void update_inactive(struct msm_gem_object *msm_obj) struct msm_drm_private *priv = msm_obj->base.dev->dev_private;
mutex_lock(&priv->mm_lock); - WARN_ON(msm_obj->active_count != 0); + GEM_WARN_ON(msm_obj->active_count != 0);
if (msm_obj->dontneed) mark_unpurgable(msm_obj); @@ -830,7 +830,7 @@ static void update_inactive(struct msm_gem_object *msm_obj) list_add_tail(&msm_obj->mm_list, &priv->inactive_dontneed); mark_purgable(msm_obj); } else { - WARN_ON(msm_obj->madv != __MSM_MADV_PURGED); + GEM_WARN_ON(msm_obj->madv != __MSM_MADV_PURGED); list_add_tail(&msm_obj->mm_list, &priv->inactive_purged); }
@@ -1010,12 +1010,12 @@ void msm_gem_free_object(struct drm_gem_object *obj) msm_gem_lock(obj);
/* object should not be on active list: */ - WARN_ON(is_active(msm_obj)); + GEM_WARN_ON(is_active(msm_obj));
put_iova_spaces(obj);
if (obj->import_attach) { - WARN_ON(msm_obj->vaddr); + GEM_WARN_ON(msm_obj->vaddr);
/* Don't drop the pages for imported dmabuf, as they are not * ours, just free the array we allocated: @@ -1131,7 +1131,7 @@ static struct drm_gem_object *_msm_gem_new(struct drm_device *dev, else if ((flags & (MSM_BO_STOLEN | MSM_BO_SCANOUT)) && priv->vram.size) use_vram = true;
- if (WARN_ON(use_vram && !priv->vram.size)) + if (GEM_WARN_ON(use_vram && !priv->vram.size)) return ERR_PTR(-EINVAL);
/* Disallow zero sized objects as they make the underlying diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index 7c7d54bad189..917af526a5c5 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -11,6 +11,11 @@ #include <linux/dma-resv.h> #include "msm_drv.h"
+/* Make all GEM related WARN_ON()s ratelimited.. when things go wrong they + * tend to go wrong 1000s of times in a short timespan. + */ +#define GEM_WARN_ON(x) WARN_RATELIMIT(x, "%s", __stringify(x)) + /* Additional internal-use only BO flags: */ #define MSM_BO_STOLEN 0x10000000 /* try to use stolen/splash memory */ #define MSM_BO_MAP_PRIV 0x20000000 /* use IOMMU_PRIV when mapping */ @@ -203,7 +208,7 @@ msm_gem_is_locked(struct drm_gem_object *obj)
static inline bool is_active(struct msm_gem_object *msm_obj) { - WARN_ON(!msm_gem_is_locked(&msm_obj->base)); + GEM_WARN_ON(!msm_gem_is_locked(&msm_obj->base)); return msm_obj->active_count; }
@@ -221,7 +226,7 @@ static inline bool is_purgeable(struct msm_gem_object *msm_obj)
static inline bool is_vunmapable(struct msm_gem_object *msm_obj) { - WARN_ON(!msm_gem_is_locked(&msm_obj->base)); + GEM_WARN_ON(!msm_gem_is_locked(&msm_obj->base)); return (msm_obj->vmap_count == 0) && msm_obj->vaddr; }
@@ -229,12 +234,12 @@ static inline void mark_purgable(struct msm_gem_object *msm_obj) { struct msm_drm_private *priv = msm_obj->base.dev->dev_private;
- WARN_ON(!mutex_is_locked(&priv->mm_lock)); + GEM_WARN_ON(!mutex_is_locked(&priv->mm_lock));
if (is_unpurgable(msm_obj)) return;
- if (WARN_ON(msm_obj->dontneed)) + if (GEM_WARN_ON(msm_obj->dontneed)) return;
priv->shrinkable_count += msm_obj->base.size >> PAGE_SHIFT; @@ -245,16 +250,16 @@ static inline void mark_unpurgable(struct msm_gem_object *msm_obj) { struct msm_drm_private *priv = msm_obj->base.dev->dev_private;
- WARN_ON(!mutex_is_locked(&priv->mm_lock)); + GEM_WARN_ON(!mutex_is_locked(&priv->mm_lock));
if (is_unpurgable(msm_obj)) return;
- if (WARN_ON(!msm_obj->dontneed)) + if (GEM_WARN_ON(!msm_obj->dontneed)) return;
priv->shrinkable_count -= msm_obj->base.size >> PAGE_SHIFT; - WARN_ON(priv->shrinkable_count < 0); + GEM_WARN_ON(priv->shrinkable_count < 0); msm_obj->dontneed = false; }
From: Rob Clark robdclark@chromium.org
So we don't have to duplicate the boilerplate for eviction.
This also lets us re-use the main scan loop for vmap shrinker.
Signed-off-by: Rob Clark robdclark@chromium.org --- drivers/gpu/drm/msm/msm_gem_shrinker.c | 94 +++++++++++++------------- 1 file changed, 46 insertions(+), 48 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_gem_shrinker.c b/drivers/gpu/drm/msm/msm_gem_shrinker.c index 33a49641ef30..38bf919f8508 100644 --- a/drivers/gpu/drm/msm/msm_gem_shrinker.c +++ b/drivers/gpu/drm/msm/msm_gem_shrinker.c @@ -17,21 +17,35 @@ msm_gem_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) return priv->shrinkable_count; }
+static bool +purge(struct msm_gem_object *msm_obj) +{ + if (!is_purgeable(msm_obj)) + return false; + + /* + * This will move the obj out of still_in_list to + * the purged list + */ + msm_gem_purge(&msm_obj->base); + + return true; +} + static unsigned long -msm_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) +scan(struct msm_drm_private *priv, unsigned nr_to_scan, struct list_head *list, + bool (*shrink)(struct msm_gem_object *msm_obj)) { - struct msm_drm_private *priv = - container_of(shrinker, struct msm_drm_private, shrinker); + unsigned freed = 0; struct list_head still_in_list; - unsigned long freed = 0;
INIT_LIST_HEAD(&still_in_list);
mutex_lock(&priv->mm_lock);
- while (freed < sc->nr_to_scan) { + while (freed < nr_to_scan) { struct msm_gem_object *msm_obj = list_first_entry_or_null( - &priv->inactive_dontneed, typeof(*msm_obj), mm_list); + list, typeof(*msm_obj), mm_list);
if (!msm_obj) break; @@ -62,14 +76,9 @@ msm_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) if (!msm_gem_trylock(&msm_obj->base)) goto tail;
- if (is_purgeable(msm_obj)) { - /* - * This will move the obj out of still_in_list to - * the purged list - */ - msm_gem_purge(&msm_obj->base); + if (shrink(msm_obj)) freed += msm_obj->base.size >> PAGE_SHIFT; - } + msm_gem_unlock(&msm_obj->base);
tail: @@ -77,16 +86,25 @@ msm_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) mutex_lock(&priv->mm_lock); }
- list_splice_tail(&still_in_list, &priv->inactive_dontneed); + list_splice_tail(&still_in_list, list); mutex_unlock(&priv->mm_lock);
- if (freed > 0) { + return freed; +} + +static unsigned long +msm_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) +{ + struct msm_drm_private *priv = + container_of(shrinker, struct msm_drm_private, shrinker); + unsigned long freed; + + freed = scan(priv, sc->nr_to_scan, &priv->inactive_dontneed, purge); + + if (freed > 0) trace_msm_gem_purge(freed << PAGE_SHIFT); - } else { - return SHRINK_STOP; - }
- return freed; + return (freed > 0) ? freed : SHRINK_STOP; }
/* since we don't know any better, lets bail after a few @@ -95,29 +113,15 @@ msm_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) */ static const int vmap_shrink_limit = 15;
-static unsigned -vmap_shrink(struct list_head *mm_list) +static bool +vmap_shrink(struct msm_gem_object *msm_obj) { - struct msm_gem_object *msm_obj; - unsigned unmapped = 0; + if (!is_vunmapable(msm_obj)) + return false;
- list_for_each_entry(msm_obj, mm_list, mm_list) { - /* Use trylock, because we cannot block on a obj that - * might be trying to acquire mm_lock - */ - if (!msm_gem_trylock(&msm_obj->base)) - continue; - if (is_vunmapable(msm_obj)) { - msm_gem_vunmap(&msm_obj->base); - unmapped++; - } - msm_gem_unlock(&msm_obj->base); + msm_gem_vunmap(&msm_obj->base);
- if (++unmapped >= vmap_shrink_limit) - break; - } - - return unmapped; + return true; }
static int @@ -133,17 +137,11 @@ msm_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr) }; unsigned idx, unmapped = 0;
- mutex_lock(&priv->mm_lock); - - for (idx = 0; mm_lists[idx]; idx++) { - unmapped += vmap_shrink(mm_lists[idx]); - - if (unmapped >= vmap_shrink_limit) - break; + for (idx = 0; mm_lists[idx] && unmapped < vmap_shrink_limit; idx++) { + unmapped += scan(priv, vmap_shrink_limit - unmapped, + mm_lists[idx], vmap_shrink); }
- mutex_unlock(&priv->mm_lock); - *(unsigned long *)ptr += unmapped;
if (unmapped > 0)
From: Rob Clark robdclark@chromium.org
Currently this doesn't matter since we keep the pages pinned until the object is destroyed. But when we start unpinning pages to allow objects to be evicted to swap, it will.
Signed-off-by: Rob Clark robdclark@chromium.org --- drivers/gpu/drm/msm/msm_gem.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index d5abe8aa9978..71530a89b675 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -162,6 +162,7 @@ static void put_pages(struct drm_gem_object *obj)
sg_free_table(msm_obj->sgt); kfree(msm_obj->sgt); + msm_obj->sgt = NULL; }
if (use_pages(obj))
From: Rob Clark robdclark@chromium.org
Currently these always go together, either when we purge MADV_DONTNEED objects or when the object is freed. But for unpin, we want to be able to purge (unmap from iommu) the vma, while keeping the iova range allocated (so we can remap back to the same GPU virtual address when the object is re-pinned).
Signed-off-by: Rob Clark robdclark@chromium.org --- drivers/gpu/drm/msm/msm_gem.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 71530a89b675..5f0647adc29d 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -357,9 +357,14 @@ static void del_vma(struct msm_gem_vma *vma) kfree(vma); }
-/* Called with msm_obj locked */ +/** + * If close is true, this also closes the VMA (releasing the allocated + * iova range) in addition to removing the iommu mapping. In the eviction + * case (!close), we keep the iova allocated, but only remove the iommu + * mapping. + */ static void -put_iova_spaces(struct drm_gem_object *obj) +put_iova_spaces(struct drm_gem_object *obj, bool close) { struct msm_gem_object *msm_obj = to_msm_bo(obj); struct msm_gem_vma *vma; @@ -369,7 +374,8 @@ put_iova_spaces(struct drm_gem_object *obj) list_for_each_entry(vma, &msm_obj->vmas, list) { if (vma->aspace) { msm_gem_purge_vma(vma->aspace, vma); - msm_gem_close_vma(vma->aspace, vma); + if (close) + msm_gem_close_vma(vma->aspace, vma); } } } @@ -711,7 +717,8 @@ void msm_gem_purge(struct drm_gem_object *obj) GEM_WARN_ON(!is_purgeable(msm_obj)); GEM_WARN_ON(obj->import_attach);
- put_iova_spaces(obj); + /* Get rid of any iommu mapping(s): */ + put_iova_spaces(obj, true);
msm_gem_vunmap(obj);
@@ -1013,7 +1020,7 @@ void msm_gem_free_object(struct drm_gem_object *obj) /* object should not be on active list: */ GEM_WARN_ON(is_active(msm_obj));
- put_iova_spaces(obj); + put_iova_spaces(obj, true);
if (obj->import_attach) { GEM_WARN_ON(msm_obj->vaddr);
From: Rob Clark robdclark@chromium.org
Currently nearly everything, other than newly allocated objects which are not yet backed by pages, is pinned and resident in RAM. But it will be nice to have some stats on what is unpinned once that is supported.
Signed-off-by: Rob Clark robdclark@chromium.org --- drivers/gpu/drm/msm/msm_gem.c | 7 +++++++ drivers/gpu/drm/msm/msm_gem.h | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 5f0647adc29d..9ff37904ec2b 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -902,6 +902,11 @@ void msm_gem_describe(struct drm_gem_object *obj, struct seq_file *m, stats->active.size += obj->size; }
+ if (msm_obj->pages) { + stats->resident.count++; + stats->resident.size += obj->size; + } + switch (msm_obj->madv) { case __MSM_MADV_PURGED: stats->purged.count++; @@ -991,6 +996,8 @@ void msm_gem_describe_objects(struct list_head *list, struct seq_file *m) stats.all.count, stats.all.size); seq_printf(m, "Active: %4d objects, %9zu bytes\n", stats.active.count, stats.active.size); + seq_printf(m, "Resident: %4d objects, %9zu bytes\n", + stats.resident.count, stats.resident.size); seq_printf(m, "Purgable: %4d objects, %9zu bytes\n", stats.purgable.count, stats.purgable.size); seq_printf(m, "Purged: %4d objects, %9zu bytes\n", diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index 917af526a5c5..e13a9301b616 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -162,13 +162,13 @@ struct drm_gem_object *msm_gem_import(struct drm_device *dev, struct dma_buf *dmabuf, struct sg_table *sgt); __printf(2, 3) void msm_gem_object_set_name(struct drm_gem_object *bo, const char *fmt, ...); -#ifdef CONFIG_DEBUG_FS
+#ifdef CONFIG_DEBUG_FS struct msm_gem_stats { struct { unsigned count; size_t size; - } all, active, purgable, purged; + } all, active, resident, purgable, purged; };
void msm_gem_describe(struct drm_gem_object *obj, struct seq_file *m,
From: Rob Clark robdclark@chromium.org
Objects that are candidates for swapping out are (1) willneed (i.e. if they are purgeable/MADV_DONTNEED we can just free the pages without them having to land in swap), (2) not on an active list, (3) not dma-buf imported or exported, and (4) not vmap'd. This repurposes the purged list for objects that do not have backing pages (either because they have not been pinned for the first time yet, or, in a later patch, because they have been unpinned/evicted).
Signed-off-by: Rob Clark robdclark@chromium.org --- drivers/gpu/drm/msm/msm_drv.c | 2 +- drivers/gpu/drm/msm/msm_drv.h | 13 ++++++---- drivers/gpu/drm/msm/msm_gem.c | 44 ++++++++++++++++++++++++++-------- drivers/gpu/drm/msm/msm_gem.h | 45 +++++++++++++++++++++++++++++++++++ 4 files changed, 89 insertions(+), 15 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index e12d5fbd0a34..d3d6c743b7af 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -451,7 +451,7 @@ static int msm_drm_init(struct device *dev, const struct drm_driver *drv)
INIT_LIST_HEAD(&priv->inactive_willneed); INIT_LIST_HEAD(&priv->inactive_dontneed); - INIT_LIST_HEAD(&priv->inactive_purged); + INIT_LIST_HEAD(&priv->inactive_unpinned); mutex_init(&priv->mm_lock);
/* Teach lockdep about lock ordering wrt. shrinker: */ diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h index 6a42cdf4cf7e..2668941df529 100644 --- a/drivers/gpu/drm/msm/msm_drv.h +++ b/drivers/gpu/drm/msm/msm_drv.h @@ -182,11 +182,15 @@ struct msm_drm_private { struct mutex obj_lock;
/** - * Lists of inactive GEM objects. Every bo is either in one of the + * LRUs of inactive GEM objects. Every bo is either in one of the * inactive lists (depending on whether or not it is shrinkable) or * gpu->active_list (for the gpu it is active on[1]), or transiently * on a temporary list as the shrinker is running. * + * Note that inactive_willneed also contains pinned and vmap'd bos, + * but the number of pinned-but-not-active objects is small (scanout + * buffers, ringbuffer, etc). + * * These lists are protected by mm_lock (which should be acquired * before per GEM object lock). One should *not* hold mm_lock in * get_pages()/vmap()/etc paths, as they can trigger the shrinker. @@ -194,10 +198,11 @@ struct msm_drm_private { * [1] if someone ever added support for the old 2d cores, there could be * more than one gpu object */ - struct list_head inactive_willneed; /* inactive + !shrinkable */ - struct list_head inactive_dontneed; /* inactive + shrinkable */ - struct list_head inactive_purged; /* inactive + purged */ + struct list_head inactive_willneed; /* inactive + potentially unpin/evictable */ + struct list_head inactive_dontneed; /* inactive + shrinkable */ + struct list_head inactive_unpinned; /* inactive + purged or unpinned */ long shrinkable_count; /* write access under mm_lock */ + long evictable_count; /* write access under mm_lock */ struct mutex mm_lock;
struct workqueue_struct *wq; diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 9ff37904ec2b..9ac89951080c 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -130,6 +130,9 @@ static struct page **get_pages(struct drm_gem_object *obj) */ if (msm_obj->flags & (MSM_BO_WC|MSM_BO_UNCACHED)) sync_for_device(msm_obj); + + GEM_WARN_ON(msm_obj->active_count); + update_inactive(msm_obj); }
return msm_obj->pages; @@ -428,7 +431,7 @@ static int msm_gem_pin_iova(struct drm_gem_object *obj, struct msm_gem_object *msm_obj = to_msm_bo(obj); struct msm_gem_vma *vma; struct page **pages; - int prot = IOMMU_READ; + int ret, prot = IOMMU_READ;
if (!(msm_obj->flags & MSM_BO_GPU_READONLY)) prot |= IOMMU_WRITE; @@ -449,8 +452,13 @@ static int msm_gem_pin_iova(struct drm_gem_object *obj, if (IS_ERR(pages)) return PTR_ERR(pages);
- return msm_gem_map_vma(aspace, vma, prot, + ret = msm_gem_map_vma(aspace, vma, prot, msm_obj->sgt, obj->size >> PAGE_SHIFT); + + if (!ret) + msm_obj->pin_count++; + + return ret; }
static int get_and_pin_iova_range_locked(struct drm_gem_object *obj, @@ -542,14 +550,21 @@ uint64_t msm_gem_iova(struct drm_gem_object *obj, void msm_gem_unpin_iova_locked(struct drm_gem_object *obj, struct msm_gem_address_space *aspace) { + struct msm_gem_object *msm_obj = to_msm_bo(obj); struct msm_gem_vma *vma;
GEM_WARN_ON(!msm_gem_is_locked(obj));
vma = lookup_vma(obj, aspace);
- if (!GEM_WARN_ON(!vma)) + if (!GEM_WARN_ON(!vma)) { msm_gem_unmap_vma(aspace, vma); + + msm_obj->pin_count--; + GEM_WARN_ON(msm_obj->pin_count < 0); + + update_inactive(msm_obj); + } }
/* @@ -800,9 +815,12 @@ void msm_gem_active_get(struct drm_gem_object *obj, struct msm_gpu *gpu) GEM_WARN_ON(!msm_gem_is_locked(obj)); GEM_WARN_ON(msm_obj->madv != MSM_MADV_WILLNEED); GEM_WARN_ON(msm_obj->dontneed); + GEM_WARN_ON(!msm_obj->sgt);
if (msm_obj->active_count++ == 0) { mutex_lock(&priv->mm_lock); + if (msm_obj->evictable) + mark_unevictable(msm_obj); list_del(&msm_obj->mm_list); list_add_tail(&msm_obj->mm_list, &gpu->active_list); mutex_unlock(&priv->mm_lock); @@ -825,21 +843,28 @@ static void update_inactive(struct msm_gem_object *msm_obj) { struct msm_drm_private *priv = msm_obj->base.dev->dev_private;
+ GEM_WARN_ON(!msm_gem_is_locked(&msm_obj->base)); + + if (msm_obj->active_count != 0) + return; + mutex_lock(&priv->mm_lock); - GEM_WARN_ON(msm_obj->active_count != 0);
if (msm_obj->dontneed) mark_unpurgable(msm_obj); + if (msm_obj->evictable) + mark_unevictable(msm_obj);
list_del(&msm_obj->mm_list); - if (msm_obj->madv == MSM_MADV_WILLNEED) { + if ((msm_obj->madv == MSM_MADV_WILLNEED) && msm_obj->sgt) { list_add_tail(&msm_obj->mm_list, &priv->inactive_willneed); + mark_evictable(msm_obj); } else if (msm_obj->madv == MSM_MADV_DONTNEED) { list_add_tail(&msm_obj->mm_list, &priv->inactive_dontneed); mark_purgable(msm_obj); } else { - GEM_WARN_ON(msm_obj->madv != __MSM_MADV_PURGED); - list_add_tail(&msm_obj->mm_list, &priv->inactive_purged); + GEM_WARN_ON((msm_obj->madv != __MSM_MADV_PURGED) && msm_obj->sgt); + list_add_tail(&msm_obj->mm_list, &priv->inactive_unpinned); }
mutex_unlock(&priv->mm_lock); @@ -1201,8 +1226,7 @@ static struct drm_gem_object *_msm_gem_new(struct drm_device *dev, }
mutex_lock(&priv->mm_lock); - /* Initially obj is idle, obj->madv == WILLNEED: */ - list_add_tail(&msm_obj->mm_list, &priv->inactive_willneed); + list_add_tail(&msm_obj->mm_list, &priv->inactive_unpinned); mutex_unlock(&priv->mm_lock);
mutex_lock(&priv->obj_lock); @@ -1276,7 +1300,7 @@ struct drm_gem_object *msm_gem_import(struct drm_device *dev, msm_gem_unlock(obj);
mutex_lock(&priv->mm_lock); - list_add_tail(&msm_obj->mm_list, &priv->inactive_willneed); + list_add_tail(&msm_obj->mm_list, &priv->inactive_unpinned); mutex_unlock(&priv->mm_lock);
mutex_lock(&priv->obj_lock); diff --git a/drivers/gpu/drm/msm/msm_gem.h b/drivers/gpu/drm/msm/msm_gem.h index e13a9301b616..39b2e5584f97 100644 --- a/drivers/gpu/drm/msm/msm_gem.h +++ b/drivers/gpu/drm/msm/msm_gem.h @@ -60,6 +60,11 @@ struct msm_gem_object { */ bool dontneed : 1;
+ /** + * Is object evictable (ie. counted in priv->evictable_count)? + */ + bool evictable : 1; + /** * count of active vmap'ing */ @@ -103,6 +108,7 @@ struct msm_gem_object { char name[32]; /* Identifier to print for the debugfs files */
int active_count; + int pin_count; }; #define to_msm_bo(x) container_of(x, struct msm_gem_object, base)
@@ -263,7 +269,46 @@ static inline void mark_unpurgable(struct msm_gem_object *msm_obj) msm_obj->dontneed = false; }
+static inline bool is_unevictable(struct msm_gem_object *msm_obj) +{ + return is_unpurgable(msm_obj) || msm_obj->pin_count || msm_obj->vaddr; +} + +static inline void mark_evictable(struct msm_gem_object *msm_obj) +{ + struct msm_drm_private *priv = msm_obj->base.dev->dev_private; + + WARN_ON(!mutex_is_locked(&priv->mm_lock)); + + if (is_unevictable(msm_obj)) + return; + + if (WARN_ON(msm_obj->evictable)) + return; + + priv->evictable_count += msm_obj->base.size >> PAGE_SHIFT; + msm_obj->evictable = true; +} + +static inline void mark_unevictable(struct msm_gem_object *msm_obj) +{ + struct msm_drm_private *priv = msm_obj->base.dev->dev_private; + + WARN_ON(!mutex_is_locked(&priv->mm_lock)); + + if (is_unevictable(msm_obj)) + return; + + if (WARN_ON(!msm_obj->evictable)) + return; + + priv->evictable_count -= msm_obj->base.size >> PAGE_SHIFT; + WARN_ON(priv->evictable_count < 0); + msm_obj->evictable = false; +} + void msm_gem_purge(struct drm_gem_object *obj); +void msm_gem_evict(struct drm_gem_object *obj); void msm_gem_vunmap(struct drm_gem_object *obj);
/* Created per submit-ioctl, to track bo's and cmdstream bufs, etc,
From: Rob Clark robdclark@chromium.org
Shoot down any mmap's *first* before put_pages(). Also add a WARN_ON that the object is locked (to make it clear that this doesn't race with msm_gem_fault()) and remove a redundant WARN_ON (since is_purgeable() already covers that case).
Fixes: 68209390f116 ("drm/msm: shrinker support") Signed-off-by: Rob Clark robdclark@chromium.org --- drivers/gpu/drm/msm/msm_gem.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 9ac89951080c..163a1d30b5c9 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -729,14 +729,16 @@ void msm_gem_purge(struct drm_gem_object *obj) struct drm_device *dev = obj->dev; struct msm_gem_object *msm_obj = to_msm_bo(obj);
+ GEM_WARN_ON(!msm_gem_is_locked(obj)); GEM_WARN_ON(!is_purgeable(msm_obj)); - GEM_WARN_ON(obj->import_attach);
/* Get rid of any iommu mapping(s): */ put_iova_spaces(obj, true);
msm_gem_vunmap(obj);
+ drm_vma_node_unmap(&obj->vma_node, dev->anon_inode->i_mapping); + put_pages(obj);
put_iova_vmas(obj); @@ -744,7 +746,6 @@ void msm_gem_purge(struct drm_gem_object *obj) msm_obj->madv = __MSM_MADV_PURGED; update_inactive(msm_obj);
- drm_vma_node_unmap(&obj->vma_node, dev->anon_inode->i_mapping); drm_gem_free_mmap_offset(obj);
/* Our goal here is to return as much of the memory as
From: Rob Clark robdclark@chromium.org
Now that tracking is wired up for potentially evictable GEM objects, wire up shrinker and the remaining GEM bits for unpinning backing pages of inactive objects.
Signed-off-by: Rob Clark robdclark@chromium.org --- drivers/gpu/drm/msm/msm_gem.c | 23 ++++++++++++++++ drivers/gpu/drm/msm/msm_gem_shrinker.c | 37 +++++++++++++++++++++++++- drivers/gpu/drm/msm/msm_gpu_trace.h | 13 +++++++++ 3 files changed, 72 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index 163a1d30b5c9..2b731cf42294 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -759,6 +759,29 @@ void msm_gem_purge(struct drm_gem_object *obj) 0, (loff_t)-1); }
+/** + * Unpin the backing pages and make them available to be swapped out. + */ +void msm_gem_evict(struct drm_gem_object *obj) +{ + struct drm_device *dev = obj->dev; + struct msm_gem_object *msm_obj = to_msm_bo(obj); + + GEM_WARN_ON(!msm_gem_is_locked(obj)); + GEM_WARN_ON(is_unevictable(msm_obj)); + GEM_WARN_ON(!msm_obj->evictable); + GEM_WARN_ON(msm_obj->active_count); + + /* Get rid of any iommu mapping(s): */ + put_iova_spaces(obj, false); + + drm_vma_node_unmap(&obj->vma_node, dev->anon_inode->i_mapping); + + put_pages(obj); + + update_inactive(msm_obj); +} + void msm_gem_vunmap(struct drm_gem_object *obj) { struct msm_gem_object *msm_obj = to_msm_bo(obj); diff --git a/drivers/gpu/drm/msm/msm_gem_shrinker.c b/drivers/gpu/drm/msm/msm_gem_shrinker.c index 38bf919f8508..52828028b9d4 100644 --- a/drivers/gpu/drm/msm/msm_gem_shrinker.c +++ b/drivers/gpu/drm/msm/msm_gem_shrinker.c @@ -9,12 +9,26 @@ #include "msm_gpu.h" #include "msm_gpu_trace.h"
+bool enable_swap = true; +MODULE_PARM_DESC(enable_swap, "Enable swappable GEM buffers"); +module_param(enable_swap, bool, 0600); + +static bool can_swap(void) +{ + return enable_swap && get_nr_swap_pages() > 0; +} + static unsigned long msm_gem_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) { struct msm_drm_private *priv = container_of(shrinker, struct msm_drm_private, shrinker); - return priv->shrinkable_count; + unsigned count = priv->shrinkable_count; + + if (can_swap()) + count += priv->evictable_count; + + return count; }
static bool @@ -32,6 +46,17 @@ purge(struct msm_gem_object *msm_obj) return true; }
+static bool +evict(struct msm_gem_object *msm_obj) +{ + if (is_unevictable(msm_obj)) + return false; + + msm_gem_evict(&msm_obj->base); + + return true; +} + static unsigned long scan(struct msm_drm_private *priv, unsigned nr_to_scan, struct list_head *list, bool (*shrink)(struct msm_gem_object *msm_obj)) @@ -104,6 +129,16 @@ msm_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) if (freed > 0) trace_msm_gem_purge(freed << PAGE_SHIFT);
+ if (can_swap() && freed < sc->nr_to_scan) { + int evicted = scan(priv, sc->nr_to_scan - freed, + &priv->inactive_willneed, evict); + + if (evicted > 0) + trace_msm_gem_evict(evicted << PAGE_SHIFT); + + freed += evicted; + } + return (freed > 0) ? freed : SHRINK_STOP; }
diff --git a/drivers/gpu/drm/msm/msm_gpu_trace.h b/drivers/gpu/drm/msm/msm_gpu_trace.h index 03e0c2536b94..ca0b08d7875b 100644 --- a/drivers/gpu/drm/msm/msm_gpu_trace.h +++ b/drivers/gpu/drm/msm/msm_gpu_trace.h @@ -128,6 +128,19 @@ TRACE_EVENT(msm_gem_purge, );
+TRACE_EVENT(msm_gem_evict, + TP_PROTO(u32 bytes), + TP_ARGS(bytes), + TP_STRUCT__entry( + __field(u32, bytes) + ), + TP_fast_assign( + __entry->bytes = bytes; + ), + TP_printk("Evicting %u bytes", __entry->bytes) +); + + TRACE_EVENT(msm_gem_purge_vmaps, TP_PROTO(u32 unmapped), TP_ARGS(unmapped),
On Mon, Apr 05, 2021 at 10:45:23AM -0700, Rob Clark wrote:
From: Rob Clark robdclark@chromium.org
One would normally hope not to be under enough memory pressure to need to swap GEM objects to disk backed swap. But memory backed zram swap (as enabled on chromebooks, for example) can actually be quite fast and useful on devices with less RAM. On a 4GB device, opening up ~4 memory intensive web pages (in separate windows rather than tabs, to try and prevent tab discard), I see ~500MB worth of GEM objects, of which maybe only 10% are active at any time, and with unpin/evict enabled, only about half resident (which is a number that gets much lower if you simulate extreme memory pressure). Assuming a 2:1 compression ratio (I see a bit higher in practice, but cannot isolate swapped out GEM pages vs other), that is like having an extra 100+MB of RAM, or more under higher memory pressure.
Rob Clark (8): drm/msm: ratelimit GEM related WARN_ON()s drm/msm: Reorganize msm_gem_shrinker_scan() drm/msm: Clear msm_obj->sgt in put_pages() drm/msm: Split iova purge and close drm/msm: Add $debugfs/gem stats on resident objects drm/msm: Track potentially evictable objects drm/msm: Small msm_gem_purge() fix drm/msm: Support evicting GEM objects to swap
Given how much entertainment shrinkers are, should we aim for more common code here?
Christian has tons of fun with adding something like this for ttm (well different shades of grey). i915 is going to adopt ttm, at least for discrete.
The locking is also an utter pain, and msm seems to still live a lot in its own land here. I think as much as possible a standard approach here would be really good, ideally maybe as building blocks shared between ttm and gem-shmem drivers ... -Daniel
drivers/gpu/drm/msm/msm_drv.c | 2 +- drivers/gpu/drm/msm/msm_drv.h | 13 ++- drivers/gpu/drm/msm/msm_gem.c | 155 +++++++++++++++++-------- drivers/gpu/drm/msm/msm_gem.h | 68 +++++++++-- drivers/gpu/drm/msm/msm_gem_shrinker.c | 129 ++++++++++++-------- drivers/gpu/drm/msm/msm_gpu_trace.h | 13 +++ 6 files changed, 272 insertions(+), 108 deletions(-)
-- 2.30.2
dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
On Thu, Apr 8, 2021 at 4:15 AM Daniel Vetter daniel@ffwll.ch wrote:
On Mon, Apr 05, 2021 at 10:45:23AM -0700, Rob Clark wrote:
From: Rob Clark robdclark@chromium.org
One would normally hope not to be under enough memory pressure to need to swap GEM objects to disk backed swap. But memory backed zram swap (as enabled on chromebooks, for example) can actually be quite fast and useful on devices with less RAM. On a 4GB device, opening up ~4 memory intensive web pages (in separate windows rather than tabs, to try and prevent tab discard), I see ~500MB worth of GEM objects, of which maybe only 10% are active at any time, and with unpin/evict enabled, only about half resident (which is a number that gets much lower if you simulate extreme memory pressure). Assuming a 2:1 compression ratio (I see a bit higher in practice, but cannot isolate swapped out GEM pages vs other), that is like having an extra 100+MB of RAM, or more under higher memory pressure.
Rob Clark (8): drm/msm: ratelimit GEM related WARN_ON()s drm/msm: Reorganize msm_gem_shrinker_scan() drm/msm: Clear msm_obj->sgt in put_pages() drm/msm: Split iova purge and close drm/msm: Add $debugfs/gem stats on resident objects drm/msm: Track potentially evictable objects drm/msm: Small msm_gem_purge() fix drm/msm: Support evicting GEM objects to swap
Given how much entertainment shrinkers are, should we aim for more common code here?
Christian has tons of fun with adding something like this for ttm (well different shades of grey). i915 is going to adopt ttm, at least for discrete.
The locking is also an utter pain, and msm seems to still live a lot in its own land here. I think as much as possible a standard approach here would be really good, ideally maybe as building blocks shared between ttm and gem-shmem drivers ...
I don't disagree.. but also replacing the engines on an airplane mid-flight isn't a great option either.. ;-)
The hard part (esp. wrt locking) is tracking the state of a given bo.. ie. is it active, active+purgable, inactive+purgable, inactive+unpinnable, etc. Currently the shmem helpers don't really provide anything here. If they did, I suppose they could provide some shrinker helpers as well. Unfortunately these days I barely have enough time for drm/msm, let alone bolting this onto the shmem helpers. I would recommend that if someone wanted to do this, that they look at recent drm/msm shrinker patches that I've sent (ie. make shrinker->count() lockless, and drop the locks in shrinker->scan() body.. when the system is under heavy memory pressure, you start getting shrinker called from all the threads so contention for mm_lock can be a really bad problem)
(Well, the other potential problem is that drm/msm has a lot of different possible iommu pairings across the generations, so there is some potential here to uncover exciting new bugs.. the locking at least is the same for all the generations and pretty easy to test with and without lockdep with some tests that push essentially all memory into swap)
BR, -R
-Daniel
drivers/gpu/drm/msm/msm_drv.c | 2 +- drivers/gpu/drm/msm/msm_drv.h | 13 ++- drivers/gpu/drm/msm/msm_gem.c | 155 +++++++++++++++++-------- drivers/gpu/drm/msm/msm_gem.h | 68 +++++++++-- drivers/gpu/drm/msm/msm_gem_shrinker.c | 129 ++++++++++++-------- drivers/gpu/drm/msm/msm_gpu_trace.h | 13 +++ 6 files changed, 272 insertions(+), 108 deletions(-)
-- 2.30.2
dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
-- Daniel Vetter Software Engineer, Intel Corporation http://blog.ffwll.ch
On Thu, Apr 08, 2021 at 08:23:42AM -0700, Rob Clark wrote:
On Thu, Apr 8, 2021 at 4:15 AM Daniel Vetter daniel@ffwll.ch wrote:
On Mon, Apr 05, 2021 at 10:45:23AM -0700, Rob Clark wrote:
From: Rob Clark robdclark@chromium.org
One would normally hope not to be under enough memory pressure to need to swap GEM objects to disk backed swap. But memory backed zram swap (as enabled on chromebooks, for example) can actually be quite fast and useful on devices with less RAM. On a 4GB device, opening up ~4 memory intensive web pages (in separate windows rather than tabs, to try and prevent tab discard), I see ~500MB worth of GEM objects, of which maybe only 10% are active at any time, and with unpin/evict enabled, only about half resident (which is a number that gets much lower if you simulate extreme memory pressure). Assuming a 2:1 compression ratio (I see a bit higher in practice, but cannot isolate swapped out GEM pages vs other), that is like having an extra 100+MB of RAM, or more under higher memory pressure.
Rob Clark (8): drm/msm: ratelimit GEM related WARN_ON()s drm/msm: Reorganize msm_gem_shrinker_scan() drm/msm: Clear msm_obj->sgt in put_pages() drm/msm: Split iova purge and close drm/msm: Add $debugfs/gem stats on resident objects drm/msm: Track potentially evictable objects drm/msm: Small msm_gem_purge() fix drm/msm: Support evicting GEM objects to swap
Given how much entertainment shrinkers are, should we aim for more common code here?
Christian has tons of fun with adding something like this for ttm (well different shades of grey). i915 is going to adopt ttm, at least for discrete.
The locking is also an utter pain, and msm seems to still live a lot in its own land here. I think as much as possible a standard approach here would be really good, ideally maybe as building blocks shared between ttm and gem-shmem drivers ...
I don't disagree.. but also replacing the engines on an airplane mid-flight isn't a great option either.. ;-)
The hard part (esp. wrt locking) is tracking the state of a given bo.. ie. is it active, active+purgable, inactive+purgable, inactive+unpinnable, etc. Currently the shmem helpers don't really provide anything here. If they did, I suppose they could provide some shrinker helpers as well. Unfortunately these days I barely have enough time for drm/msm, let alone bolting this onto the shmem helpers. I would recommend that if someone wanted to do this, that they look at recent drm/msm shrinker patches that I've sent (ie. make shrinker->count() lockless, and drop the locks in shrinker->scan() body.. when the system is under heavy memory pressure, you start getting shrinker called from all the threads so contention for mm_lock can be a really bad problem)
(Well, the other potential problem is that drm/msm has a lot of different possible iommu pairings across the generations, so there is some potential here to uncover exciting new bugs.. the locking at least is the same for all the generations and pretty easy to test with and without lockdep with some tests that push essentially all memory into swap)
So what we aimed for with i915 and discrete gpu is to first align on locking with dma_resv_lock for all buffer state, plus a bunch of lru/allocator locks for lists and stuff.
And then with more aligned locking, figure out how to maybe share more code.
The trouble is that right now neither shmem helpers, nor drivers using them, are really using dma_resv_lock to protect their per-buffer state.
So yeah it's a bit of an awkward situation, and I don't know myself really how to get out of it. Lack of people with tons of free time doesn't help much.
So best case I think is that every time we touch helpers or drivers locking in a big way, we check whether it's at least slightly going towards dma_resv_lock or not. And at least make sure we're not going backwards, and maybe not spin wheels at standstill.
I guess my question is, what would be good to have to make sure we at least all agree on the overall direction? -Daniel
BR, -R
-Daniel
drivers/gpu/drm/msm/msm_drv.c | 2 +- drivers/gpu/drm/msm/msm_drv.h | 13 ++- drivers/gpu/drm/msm/msm_gem.c | 155 +++++++++++++++++-------- drivers/gpu/drm/msm/msm_gem.h | 68 +++++++++-- drivers/gpu/drm/msm/msm_gem_shrinker.c | 129 ++++++++++++-------- drivers/gpu/drm/msm/msm_gpu_trace.h | 13 +++ 6 files changed, 272 insertions(+), 108 deletions(-)
-- 2.30.2
dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
-- Daniel Vetter Software Engineer, Intel Corporation http://blog.ffwll.ch
dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
On Mon, Apr 12, 2021 at 7:28 AM Daniel Vetter daniel@ffwll.ch wrote:
On Thu, Apr 08, 2021 at 08:23:42AM -0700, Rob Clark wrote:
On Thu, Apr 8, 2021 at 4:15 AM Daniel Vetter daniel@ffwll.ch wrote:
On Mon, Apr 05, 2021 at 10:45:23AM -0700, Rob Clark wrote:
From: Rob Clark robdclark@chromium.org
One would normally hope not to be under enough memory pressure to need to swap GEM objects to disk backed swap. But memory backed zram swap (as enabled on chromebooks, for example) can actually be quite fast and useful on devices with less RAM. On a 4GB device, opening up ~4 memory intensive web pages (in separate windows rather than tabs, to try and prevent tab discard), I see ~500MB worth of GEM objects, of which maybe only 10% are active at any time, and with unpin/evict enabled, only about half resident (which is a number that gets much lower if you simulate extreme memory pressure). Assuming a 2:1 compression ratio (I see a bit higher in practice, but cannot isolate swapped out GEM pages vs other), that is like having an extra 100+MB of RAM, or more under higher memory pressure.
Rob Clark (8): drm/msm: ratelimit GEM related WARN_ON()s drm/msm: Reorganize msm_gem_shrinker_scan() drm/msm: Clear msm_obj->sgt in put_pages() drm/msm: Split iova purge and close drm/msm: Add $debugfs/gem stats on resident objects drm/msm: Track potentially evictable objects drm/msm: Small msm_gem_purge() fix drm/msm: Support evicting GEM objects to swap
Given how much entertainment shrinkers are, should we aim for more common code here?
Christian has tons of fun with adding something like this for ttm (well different shades of grey). i915 is going to adopt ttm, at least for discrete.
The locking is also an utter pain, and msm seems to still live a lot in its own land here. I think as much as possible a standard approach here would be really good, ideally maybe as building blocks shared between ttm and gem-shmem drivers ...
I don't disagree.. but also replacing the engines on an airplane mid-flight isn't a great option either.. ;-)
The hard part (esp. wrt locking) is tracking the state of a given bo.. ie. is it active, active+purgable, inactive+purgable, inactive+unpinnable, etc. Currently the shmem helpers don't really provide anything here. If they did, I suppose they could provide some shrinker helpers as well. Unfortunately these days I barely have enough time for drm/msm, let alone bolting this onto the shmem helpers. I would recommend that if someone wanted to do this, that they look at recent drm/msm shrinker patches that I've sent (ie. make shrinker->count() lockless, and drop the locks in shrinker->scan() body.. when the system is under heavy memory pressure, you start getting shrinker called from all the threads so contention for mm_lock can be a really bad problem)
(Well, the other potential problem is that drm/msm has a lot of different possible iommu pairings across the generations, so there is some potential here to uncover exciting new bugs.. the locking at least is the same for all the generations and pretty easy to test with and without lockdep with some tests that push essentially all memory into swap)
So what we aimed for with i915 and discrete gpu is to first align on locking with dma_resv_lock for all buffer state, plus a bunch of lru/allocator locks for lists and stuff.
And then with more aligned locking, figure out how to maybe share more code.
The trouble is that right now neither shmem helpers, nor drivers using them, are really using dma_resv_lock to protect their per-buffer state.
We are actually already using dma_resv_lock() since a few release cycles back.. (msm_gem_lock() and friends are a wrapper around that from the migration away from using our own lock).. the mm_lock is simply protecting the lists, not the objects
So yeah it's a bit of an awkward situation, and I don't know myself really how to get out of it. Lack of people with tons of free time doesn't help much.
So best case I think is that every time we touch helpers or drivers locking in a big way, we check whether it's at least slightly going towards dma_resv_lock or not. And at least make sure we're not going backwards, and maybe not spin wheels at standstill.
I guess my question is, what would be good to have to make sure we at least all agree on the overall direction?
I guess if gem_shmem users aren't already using resv lock, moving in that direction would be a good idea. Maybe it would make sense to build more object state tracking into gem_shmem helpers (ie. so you can know which buffers are active/purgable/unpinnable/etc without traversing a list of *all* gem objects).. that seems like pushing it more in the direction of being ttm-style frameworky compared to the simple helper API that it is now. But maybe that is a good thing?
BR, -R
-Daniel
BR, -R
-Daniel
drivers/gpu/drm/msm/msm_drv.c | 2 +- drivers/gpu/drm/msm/msm_drv.h | 13 ++- drivers/gpu/drm/msm/msm_gem.c | 155 +++++++++++++++++-------- drivers/gpu/drm/msm/msm_gem.h | 68 +++++++++-- drivers/gpu/drm/msm/msm_gem_shrinker.c | 129 ++++++++++++-------- drivers/gpu/drm/msm/msm_gpu_trace.h | 13 +++ 6 files changed, 272 insertions(+), 108 deletions(-)
-- 2.30.2
dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
-- Daniel Vetter Software Engineer, Intel Corporation http://blog.ffwll.ch
dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
-- Daniel Vetter Software Engineer, Intel Corporation http://blog.ffwll.ch
On Mon, Apr 12, 2021 at 08:23:33AM -0700, Rob Clark wrote:
On Mon, Apr 12, 2021 at 7:28 AM Daniel Vetter daniel@ffwll.ch wrote:
On Thu, Apr 08, 2021 at 08:23:42AM -0700, Rob Clark wrote:
On Thu, Apr 8, 2021 at 4:15 AM Daniel Vetter daniel@ffwll.ch wrote:
On Mon, Apr 05, 2021 at 10:45:23AM -0700, Rob Clark wrote:
From: Rob Clark robdclark@chromium.org
One would normally hope not to be under enough memory pressure to need to swap GEM objects to disk backed swap. But memory backed zram swap (as enabled on chromebooks, for example) can actually be quite fast and useful on devices with less RAM. On a 4GB device, opening up ~4 memory intensive web pages (in separate windows rather than tabs, to try and prevent tab discard), I see ~500MB worth of GEM objects, of which maybe only 10% are active at any time, and with unpin/evict enabled, only about half resident (which is a number that gets much lower if you simulate extreme memory pressure). Assuming a 2:1 compression ratio (I see a bit higher in practice, but cannot isolate swapped out GEM pages vs other), that is like having an extra 100+MB of RAM, or more under higher memory pressure.
Rob Clark (8): drm/msm: ratelimit GEM related WARN_ON()s drm/msm: Reorganize msm_gem_shrinker_scan() drm/msm: Clear msm_obj->sgt in put_pages() drm/msm: Split iova purge and close drm/msm: Add $debugfs/gem stats on resident objects drm/msm: Track potentially evictable objects drm/msm: Small msm_gem_purge() fix drm/msm: Support evicting GEM objects to swap
Given how much entertainment shrinkers are, should we aim for more common code here?
Christian has tons of fun with adding something like this for ttm (well different shades of grey). i915 is going to adopt ttm, at least for discrete.
The locking is also an utter pain, and msm seems to still live a lot in its own land here. I think as much as possible a standard approach here would be really good, ideally maybe as building blocks shared between ttm and gem-shmem drivers ...
I don't disagree.. but also replacing the engines on an airplane mid-flight isn't a great option either.. ;-)
The hard part (esp. wrt locking) is tracking the state of a given bo.. ie. is it active, active+purgable, inactive+purgable, inactive+unpinnable, etc. Currently the shmem helpers don't really provide anything here. If they did, I suppose they could provide some shrinker helpers as well. Unfortunately these days I barely have enough time for drm/msm, let alone bolting this onto the shmem helpers. I would recommend that if someone wanted to do this, that they look at recent drm/msm shrinker patches that I've sent (ie. make shrinker->count() lockless, and drop the locks in shrinker->scan() body.. when the system is under heavy memory pressure, you start getting shrinker called from all the threads so contention for mm_lock can be a really bad problem)
(Well, the other potential problem is that drm/msm has a lot of different possible iommu pairings across the generations, so there is some potential here to uncover exciting new bugs.. the locking at least is the same for all the generations and pretty easy to test with and without lockdep with some tests that push essentially all memory into swap)
So what we aimed for with i915 and discrete gpu is to first align on locking with dma_resv_lock for all buffer state, plus a bunch of lru/allocator locks for lists and stuff.
And then with more aligned locking, figure out how to maybe share more code.
The trouble is that right now neither shmem helpers, nor drivers using them, are really using dma_resv_lock to protect their per-buffer state.
We are actually already using dma_resv_lock() since a few release cycles back.. (msm_gem_lock() and friends are a wrapper around that from the migration away from using our own lock).. the mm_lock is simply protecting the lists, not the objects
Oh I thought there were still some warts here scanning through your series. I guess I got confused, yay :-)
So yeah it's a bit of an awkward situation, and I don't know myself really how to get out of it. Lack of people with tons of free time doesn't help much.
So best case I think is that every time we touch helpers or drivers locking in a big way, we check whether it's at least slightly going towards dma_resv_lock or not. And at least make sure we're not going backwards, and maybe not spin wheels at standstill.
I guess my question is, what would be good to have to make sure we at least all agree on the overall direction?
I guess if gem_shmem users aren't already using resv lock, moving in that direction would be a good idea. Maybe it would make sense to build more object state tracking into gem_shmem helpers (ie. so you can know which buffers are active/purgable/unpinnable/etc without traversing a list of *all* gem objects).. that seems like pushing it more in the direction of being ttm-style frameworky compared to the simple helper API that it is now. But maybe that is a good thing?
Moving shmem helpers is on the todo already.
https://dri.freedesktop.org/docs/drm/gpu/todo.html#move-buffer-object-lockin...
And yes I think letting everyone reinvent their buffer locking scheme wasn't the best idea. But otoh ttm was a monolith, and before Maarten spent a lot of time pulling out dma_fence/resv and ww_mutex it really wasn't reasonable to align with the design without pulling in the entire monolith. The code improved a lot in this regard.
Also yeah I think pushing more object state into shmem helpers would probably be good, but ideally not on the current locking ... -Daniel
BR, -R
-Daniel
BR, -R
-Daniel
drivers/gpu/drm/msm/msm_drv.c | 2 +- drivers/gpu/drm/msm/msm_drv.h | 13 ++- drivers/gpu/drm/msm/msm_gem.c | 155 +++++++++++++++++-------- drivers/gpu/drm/msm/msm_gem.h | 68 +++++++++-- drivers/gpu/drm/msm/msm_gem_shrinker.c | 129 ++++++++++++-------- drivers/gpu/drm/msm/msm_gpu_trace.h | 13 +++ 6 files changed, 272 insertions(+), 108 deletions(-)
-- 2.30.2
dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
-- Daniel Vetter Software Engineer, Intel Corporation http://blog.ffwll.ch
dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
-- Daniel Vetter Software Engineer, Intel Corporation http://blog.ffwll.ch
dri-devel mailing list dri-devel@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/dri-devel
dri-devel@lists.freedesktop.org