[RFC PATCH] drm/xe: fix memory leaks on vm_bind_ioctl failure paths

Zanoni, Paulo R paulo.r.zanoni at intel.com
Fri Sep 27 23:37:42 UTC 2024


On Fri, 2024-09-27 at 23:14 +0000, Matthew Brost wrote:
> On Thu, Sep 26, 2024 at 05:36:33PM -0700, Paulo Zanoni wrote:
> > Caught by kmemleak when running Vulkan CTS tests on LNL. The leak
> > seems to happen only when there's some kind of failure happening, like
> > the lack of memory. Example output:
> > 
> 
> So I think this patch is correct that it will a memory leak. I'm going
> to fix it a slightly different way though. It would be great if you
> could tests this out.

I'll see if I can test it Monday. Thanks!

> 
> I doubt this would cause any major problems though as just leak a few
> pages here and there.
> 
> > unreferenced object 0xffff9120bdf62000 (size 8192):
> >   comm "deqp-vk", pid 115008, jiffies 4310295728
> >   hex dump (first 32 bytes):
> >     00 00 00 00 00 00 00 00 1b 05 f9 28 01 00 00 40  ...........(...@
> >     00 00 00 00 00 00 00 00 1b 15 f9 28 01 00 00 40  ...........(...@
> >   backtrace (crc 7a56be79):
> >     [<ffffffff86dd81f0>] __kmalloc_cache_noprof+0x310/0x3d0
> >     [<ffffffffc08e8211>] xe_pt_new_shared.constprop.0+0x81/0xb0 [xe]
> >     [<ffffffffc08e8309>] xe_pt_insert_entry+0xb9/0x140 [xe]
> >     [<ffffffffc08eab6d>] xe_pt_stage_bind_entry+0x12d/0x5b0 [xe]
> >     [<ffffffffc08ecbca>] xe_pt_walk_range+0xea/0x280 [xe]
> >     [<ffffffffc08eccea>] xe_pt_walk_range+0x20a/0x280 [xe]
> >     [<ffffffffc08eccea>] xe_pt_walk_range+0x20a/0x280 [xe]
> >     [<ffffffffc08eccea>] xe_pt_walk_range+0x20a/0x280 [xe]
> >     [<ffffffffc08eccea>] xe_pt_walk_range+0x20a/0x280 [xe]
> >     [<ffffffffc08e9eff>] xe_pt_stage_bind.constprop.0+0x25f/0x580 [xe]
> >     [<ffffffffc08eb21a>] bind_op_prepare+0xea/0x6e0 [xe]
> >     [<ffffffffc08ebab8>] xe_pt_update_ops_prepare+0x1c8/0x440 [xe]
> >     [<ffffffffc08ffbf3>] ops_execute+0x143/0x850 [xe]
> >     [<ffffffffc0900b64>] vm_bind_ioctl_ops_execute+0x244/0x800 [xe]
> >     [<ffffffffc0906467>] xe_vm_bind_ioctl+0x1877/0x2370 [xe]
> >     [<ffffffffc05e92b3>] drm_ioctl_kernel+0xb3/0x110 [drm]
> > unreferenced object 0xffff9120bdf72000 (size 8192):
> >   comm "deqp-vk", pid 115008, jiffies 4310295728
> >   hex dump (first 32 bytes):
> >     6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
> >     6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
> >   backtrace (crc 23b2f0b5):
> >     [<ffffffff86dd81f0>] __kmalloc_cache_noprof+0x310/0x3d0
> >     [<ffffffffc08e8211>] xe_pt_new_shared.constprop.0+0x81/0xb0 [xe]
> >     [<ffffffffc08e8453>] xe_pt_stage_unbind_post_descend+0xb3/0x150 [xe]
> >     [<ffffffffc08ecd26>] xe_pt_walk_range+0x246/0x280 [xe]
> >     [<ffffffffc08eccea>] xe_pt_walk_range+0x20a/0x280 [xe]
> >     [<ffffffffc08eccea>] xe_pt_walk_range+0x20a/0x280 [xe]
> >     [<ffffffffc08eccea>] xe_pt_walk_range+0x20a/0x280 [xe]
> >     [<ffffffffc08ece31>] xe_pt_walk_shared+0xc1/0x110 [xe]
> >     [<ffffffffc08e7b2a>] xe_pt_stage_unbind+0x9a/0xd0 [xe]
> >     [<ffffffffc08e913d>] unbind_op_prepare+0xdd/0x270 [xe]
> >     [<ffffffffc08eb9f6>] xe_pt_update_ops_prepare+0x106/0x440 [xe]
> >     [<ffffffffc08ffbf3>] ops_execute+0x143/0x850 [xe]
> >     [<ffffffffc0900b64>] vm_bind_ioctl_ops_execute+0x244/0x800 [xe]
> >     [<ffffffffc0906467>] xe_vm_bind_ioctl+0x1877/0x2370 [xe]
> >     [<ffffffffc05e92b3>] drm_ioctl_kernel+0xb3/0x110 [drm]
> >     [<ffffffffc05e95a0>] drm_ioctl+0x280/0x4e0 [drm]
> > 
> 
> How did you get this output? I used a couple kernel tools but I wasn't
> able to get output like this.

sudo cat /sys/kernel/debug/kmemleak

(but you need to have kmemleak enabled, of course)

> 
> Matt
> 
> > Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/2877
> > Signed-off-by: Paulo Zanoni <paulo.r.zanoni at intel.com>
> > ---
> >  drivers/gpu/drm/xe/xe_pt.c | 42 +++++++++++++++++++++++---------------
> >  1 file changed, 25 insertions(+), 17 deletions(-)
> > 
> > PS: I'm not super confident that this is the right place to do this
> > since I'm not too familiar with this code and pointer ownership
> > changes a lot here. Still, this patch seems to work on my system, so
> > it may serve as a good conversation starter.
> > 
> > diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> > index d6353e8969f0..212f7de645df 100644
> > --- a/drivers/gpu/drm/xe/xe_pt.c
> > +++ b/drivers/gpu/drm/xe/xe_pt.c
> > @@ -921,18 +921,23 @@ static void xe_pt_abort_bind(struct xe_vma *vma,
> >  		if (!rebind)
> >  			pt->num_live -= entries[i].qwords;
> >  
> > -		if (!pt->level)
> > -			continue;
> > -
> > -		pt_dir = as_xe_pt_dir(pt);
> > -		for (j = 0; j < entries[i].qwords; j++) {
> > -			u32 j_ = j + entries[i].ofs;
> > -			struct xe_pt *newpte = xe_pt_entry(pt_dir, j_);
> > -			struct xe_pt *oldpte = entries[i].pt_entries[j].pt;
> > -
> > -			pt_dir->children[j_] = oldpte ? &oldpte->base : 0;
> > -			xe_pt_destroy(newpte, xe_vma_vm(vma)->flags, NULL);
> > +		if (pt->level) {
> > +			pt_dir = as_xe_pt_dir(pt);
> > +			for (j = 0; j < entries[i].qwords; j++) {
> > +				u32 j_ = j + entries[i].ofs;
> > +				struct xe_pt *newpte = xe_pt_entry(pt_dir, j_);
> > +				struct xe_pt *oldpte =
> > +					entries[i].pt_entries[j].pt;
> > +
> > +				pt_dir->children[j_] = oldpte ?
> > +					&oldpte->base : 0;
> > +				xe_pt_destroy(newpte, xe_vma_vm(vma)->flags,
> > +					      NULL);
> > +			}
> >  		}
> > +
> > +		if (entries[i].pt_entries)
> > +			kfree(entries[i].pt_entries);
> >  	}
> >  }
> >  
> > @@ -1564,13 +1569,16 @@ static void xe_pt_abort_unbind(struct xe_vma *vma,
> >  
> >  		pt->num_live += entry->qwords;
> >  
> > -		if (!pt->level)
> > -			continue;
> > +		if (pt->level) {
> > +			for (j = entry->ofs; j < entry->ofs + entry->qwords; j++)
> > +				pt_dir->children[j] =
> > +					entries[i].pt_entries[j - entry->ofs].pt ?
> > +						&entries[i].pt_entries[j - entry->ofs].pt->base :
> > +						NULL;
> > +		}
> >  
> > -		for (j = entry->ofs; j < entry->ofs + entry->qwords; j++)
> > -			pt_dir->children[j] =
> > -				entries[i].pt_entries[j - entry->ofs].pt ?
> > -				&entries[i].pt_entries[j - entry->ofs].pt->base : NULL;
> > +		if (entry->pt_entries)
> > +			kfree(entry->pt_entries);
> >  	}
> >  }
> >  
> > -- 
> > 2.45.2
> > 



More information about the Intel-xe mailing list