[RFC PATCH] drm/sched: Make sure drm_sched_fence_ops don't vanish
Christian König
christian.koenig at amd.com
Mon Sep 2 10:58:54 UTC 2024
Am 31.08.24 um 00:28 schrieb Matthew Brost:
> On Fri, Aug 30, 2024 at 12:40:57PM +0200, Boris Brezillon wrote:
>> dma_fence objects created by drm_sched might hit other subsystems, which
>> means the fence object might potentially outlive the drm_sched module
>> lifetime, and at this point the dma_fence::ops points to a memory region
>> that no longer exists.
>>
>> In order to fix that, let's make sure the drm_sched_fence code is always
>> statically linked, just like dma-fence{-chain}.c does.
>>
>> Cc: Luben Tuikov <ltuikov89 at gmail.com>
>> Cc: Matthew Brost <matthew.brost at intel.com>
>> Cc: "Christian König" <christian.koenig at amd.com>
>> Cc: Danilo Krummrich <dakr at redhat.com>
>> Signed-off-by: Boris Brezillon <boris.brezillon at collabora.com>
>> ---
>> Just like for the other UAF fix, this is an RFC, as I'm not sure that's
>> how we want it fixed. The other option we have is to retain a module ref
>> for each initialized fence and release it when the fence is freed.
> So this is different than your other patch. From Xe PoV the other patch
> is referencing an object which is tied to an IOCTL rather than a module
> whereas this referencing a module. If a module can disappear when fence
> tied to the module, well that is a bit scary and Xe might have some
> issues there - I'd have audit our of exporting internally created
> fences.
>
> Based on Christian's feedback we really shouldn't be allowing this but
> also don't really love the idea of a fence holding a module ref either.
IIRC the initial proposal for dma_fences actually contained grabbing a
module reference, similar to what we do for dma-bufs.
But I think that was dropped because of the circle dependency issues and
preventing module unload. After that nobody really looked into it again.
I think using the scheduler fence to decouple the hardware fence
lifetime should work. We would just need to drop the hardware fence
reference after the scheduler fence signaled and not when it is destroyed.
This unfortunately also creates another problems for error recovery,
but that is a different topic I think.
> Seems like we need a well defined + documented rule - exported fences
> need to be completely decoupled from the module upon signaling or
> exported fences must retain a module ref. I'd probably lean towards the
> former. One reason to support the former is fences can be released in
> IRQ contexts and dropping a module ref in IRQ context seems a bit
> problematic. Also because some oher part of kernel holds on to fence ref
> lazily block a module unload just seems wrong.
Modules are not auto unloaded when their reference count becomes zero.
Only when rmmod (or the corresponding system call) is issued.
So dropping a module reference from interrupt context should be
unproblematic I think. But we should probably double check.
Fully decoupling fence destruction from the module is most likely not
possible since we will always need the free callback from the ops for
some use cases.
> Sorry if above we have well defined rule and I'm just not aware.
No, it's basically just a well known mess nobody cared much about.
Regards,
Christian.
>
> Matt
>
>> ---
>> drivers/gpu/drm/Makefile | 2 +-
>> drivers/gpu/drm/scheduler/Makefile | 7 ++++++-
>> drivers/gpu/drm/scheduler/sched_fence.c | 21 +++++++++------------
>> drivers/gpu/drm/scheduler/sched_main.c | 3 +++
>> 4 files changed, 19 insertions(+), 14 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile
>> index 68cc9258ffc4..b97127da58b7 100644
>> --- a/drivers/gpu/drm/Makefile
>> +++ b/drivers/gpu/drm/Makefile
>> @@ -158,7 +158,7 @@ obj-$(CONFIG_DRM_MIPI_DSI) += drm_mipi_dsi.o
>> obj-y += arm/
>> obj-y += display/
>> obj-$(CONFIG_DRM_TTM) += ttm/
>> -obj-$(CONFIG_DRM_SCHED) += scheduler/
>> +obj-y += scheduler/
>> obj-$(CONFIG_DRM_RADEON)+= radeon/
>> obj-$(CONFIG_DRM_AMDGPU)+= amd/amdgpu/
>> obj-$(CONFIG_DRM_AMDGPU)+= amd/amdxcp/
>> diff --git a/drivers/gpu/drm/scheduler/Makefile b/drivers/gpu/drm/scheduler/Makefile
>> index 53863621829f..bc18d26f27a1 100644
>> --- a/drivers/gpu/drm/scheduler/Makefile
>> +++ b/drivers/gpu/drm/scheduler/Makefile
>> @@ -20,6 +20,11 @@
>> # OTHER DEALINGS IN THE SOFTWARE.
>> #
>> #
>> -gpu-sched-y := sched_main.o sched_fence.o sched_entity.o
>> +
>> +# sched_fence.c contains dma_fence_ops that can't go away, so make sure it's
>> +# statically linked when CONFIG_DRM_SCHED=m
>> +obj-y += $(if $(CONFIG_DRM_SCHED),sched_fence.o,)
>> +
>> +gpu-sched-y := sched_main.o sched_entity.o
>>
>> obj-$(CONFIG_DRM_SCHED) += gpu-sched.o
>> diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c
>> index efa2a71d98eb..ac65589476dd 100644
>> --- a/drivers/gpu/drm/scheduler/sched_fence.c
>> +++ b/drivers/gpu/drm/scheduler/sched_fence.c
>> @@ -39,12 +39,7 @@ static int __init drm_sched_fence_slab_init(void)
>>
>> return 0;
>> }
>> -
>> -static void __exit drm_sched_fence_slab_fini(void)
>> -{
>> - rcu_barrier();
>> - kmem_cache_destroy(sched_fence_slab);
>> -}
>> +subsys_initcall(drm_sched_fence_slab_init);
>>
>> static void drm_sched_fence_set_parent(struct drm_sched_fence *s_fence,
>> struct dma_fence *fence)
>> @@ -74,6 +69,7 @@ void drm_sched_fence_scheduled(struct drm_sched_fence *fence,
>>
>> dma_fence_signal(&fence->scheduled);
>> }
>> +EXPORT_SYMBOL(drm_sched_fence_scheduled);
>>
>> void drm_sched_fence_finished(struct drm_sched_fence *fence, int result)
>> {
>> @@ -81,6 +77,7 @@ void drm_sched_fence_finished(struct drm_sched_fence *fence, int result)
>> dma_fence_set_error(&fence->finished, result);
>> dma_fence_signal(&fence->finished);
>> }
>> +EXPORT_SYMBOL(drm_sched_fence_finished);
>>
>> static const char *drm_sched_fence_get_driver_name(struct dma_fence *fence)
>> {
>> @@ -118,6 +115,7 @@ void drm_sched_fence_free(struct drm_sched_fence *fence)
>> if (!WARN_ON_ONCE(fence->timeline))
>> kmem_cache_free(sched_fence_slab, fence);
>> }
>> +EXPORT_SYMBOL(drm_sched_fence_free);
>>
>> /**
>> * drm_sched_fence_release_scheduled - callback that fence can be freed
>> @@ -210,6 +208,9 @@ struct drm_sched_fence *drm_sched_fence_alloc(struct drm_sched_entity *entity,
>> {
>> struct drm_sched_fence *fence = NULL;
>>
>> + if (unlikely(!sched_fence_slab))
>> + return NULL;
>> +
>> fence = kmem_cache_zalloc(sched_fence_slab, GFP_KERNEL);
>> if (fence == NULL)
>> return NULL;
>> @@ -219,6 +220,7 @@ struct drm_sched_fence *drm_sched_fence_alloc(struct drm_sched_entity *entity,
>>
>> return fence;
>> }
>> +EXPORT_SYMBOL(drm_sched_fence_alloc);
>>
>> void drm_sched_fence_init(struct drm_sched_fence *fence,
>> struct drm_sched_entity *entity)
>> @@ -234,9 +236,4 @@ void drm_sched_fence_init(struct drm_sched_fence *fence,
>> dma_fence_init(&fence->finished, &drm_sched_fence_ops_finished,
>> &fence->lock, entity->fence_context + 1, seq);
>> }
>> -
>> -module_init(drm_sched_fence_slab_init);
>> -module_exit(drm_sched_fence_slab_fini);
>> -
>> -MODULE_DESCRIPTION("DRM GPU scheduler");
>> -MODULE_LICENSE("GPL and additional rights");
>> +EXPORT_SYMBOL(drm_sched_fence_init);
>> diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
>> index 1e31a9c8ce15..eaaf086eab30 100644
>> --- a/drivers/gpu/drm/scheduler/sched_main.c
>> +++ b/drivers/gpu/drm/scheduler/sched_main.c
>> @@ -1467,3 +1467,6 @@ void drm_sched_wqueue_start(struct drm_gpu_scheduler *sched)
>> queue_work(sched->submit_wq, &sched->work_free_job);
>> }
>> EXPORT_SYMBOL(drm_sched_wqueue_start);
>> +
>> +MODULE_DESCRIPTION("DRM GPU scheduler");
>> +MODULE_LICENSE("GPL and additional rights");
>> --
>> 2.46.0
>>
More information about the dri-devel
mailing list