[PATCH v3] drm/sched: Fix kernel NULL pointer dereference error
Christian König
christian.koenig at amd.com
Tue Oct 18 12:34:55 UTC 2022
On 18.10.22 14:20, Yadav, Arvind wrote:
> [SNIP]
>>
>>> + drm_sched_fence_finished(s_fence);
>>> + dma_fence_put(&s_fence->finished);
>>> + wake_up_interruptible(&sched->wake_up_worker);
>>> +}
>>> +
>>> +int drm_sched_fence_add_parent_cb(struct dma_fence *fence,
>>> + struct drm_sched_fence *s_fence)
>>> +{
>>> + return dma_fence_add_callback(fence, &s_fence->cb,
>>> + drm_sched_job_done_cb);
>>> +}
>>> +
>>> +bool drm_sched_fence_remove_parent_cb(struct drm_sched_fence *s_fence)
>>> +{
>>> + return dma_fence_remove_callback(s_fence->parent,
>>> + &s_fence->cb);
>>> +}
>>
>> Do we really need separate functions for that?
>>
> We can use 'drm_sched_fence_set_parent', but we would need to add an
> extra NULL check before adding the callback; otherwise
> dma_fence_add_callback() will throw a warning message every time.
>
> With that NULL check in place we would never get a callback warning for
> setting a NULL parent fence.
>
> So I have kept separate functions.
I'd rather prefer having a single function and allowing the parent fence
to be set to NULL.
Alternatively we could have separate drm_sched_fence_set_parent() and
drm_sched_fence_clear_parent() functions if you really think that's
cleaner.
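Roughly what I have in mind, as a completely untested sketch (the drop of
the original fence reference and clearing s_fence->parent on removal are
hand-waved here, and drm_sched_job_done() would need to be reachable from
wherever this ends up living):

int drm_sched_fence_set_parent(struct dma_fence *fence,
			       struct drm_sched_fence *s_fence)
{
	/* A NULL fence just removes the callback from the old parent. */
	if (!fence) {
		if (s_fence->parent)
			dma_fence_remove_callback(s_fence->parent,
						  &s_fence->cb);
		return 0;
	}

	s_fence->parent = dma_fence_get(fence);

	/*
	 * If the fence is already signaled the callback can't be
	 * installed any more, so just complete the job directly.
	 */
	if (dma_fence_add_callback(fence, &s_fence->cb,
				   drm_sched_job_done_cb) == -ENOENT)
		drm_sched_job_done(s_fence);

	return 0;
}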
>>> atomic_dec(&sched->hw_rq_count);
>>> @@ -576,15 +562,14 @@ void drm_sched_start(struct drm_gpu_scheduler *sched, bool full_recovery)
>>> continue;
>>> if (fence) {
>>> - r = dma_fence_add_callback(fence, &s_job->cb,
>>> - drm_sched_job_done_cb);
>>> + r = drm_sched_fence_add_parent_cb(fence, s_job->s_fence);
>>> if (r == -ENOENT)
>>> - drm_sched_job_done(s_job);
>>> + drm_sched_job_done(s_job->s_fence);
>>> else if (r)
>>> DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
>>
>> Completely nuke that here. All of this should be done in the single
>> drm_sched_fence_set_parent() function.
>>
>> And an error message is completely superfluous. We just need to
>> handle the case that the callback can't be installed because the
>> fence is already signaled.
>>
> I will make the changes as per your review comments. Thank you for the
> review.
Just to clarify, you should nuke the error message. Error handling is
rather pointless here.
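The call site in drm_sched_main() would then shrink to something like
this (again just a sketch, reusing the variables from your hunk above),
with the -ENOENT case completely invisible to the caller:

	if (!IS_ERR_OR_NULL(fence)) {
		drm_sched_fence_set_parent(fence, s_fence);
	} else {
		if (IS_ERR(fence))
			dma_fence_set_error(&s_fence->finished,
					    PTR_ERR(fence));

		drm_sched_job_done(s_fence);
	}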
Thanks,
Christian.
>
> Thanks,
>
> ~Arvind
>
>> Regards,
>> Christian.
>>
>>> r);
>>> } else
>>> - drm_sched_job_done(s_job);
>>> + drm_sched_job_done(s_job->s_fence);
>>> }
>>> if (full_recovery) {
>>> @@ -1049,14 +1034,9 @@ static int drm_sched_main(void *param)
>>> drm_sched_fence_scheduled(s_fence);
>>> if (!IS_ERR_OR_NULL(fence)) {
>>> - s_fence->parent = dma_fence_get(fence);
>>> - /* Drop for original kref_init of the fence */
>>> - dma_fence_put(fence);
>>> -
>>> - r = dma_fence_add_callback(fence, &sched_job->cb,
>>> - drm_sched_job_done_cb);
>>> + r = drm_sched_fence_set_parent(fence, s_fence);
>>> if (r == -ENOENT)
>>> - drm_sched_job_done(sched_job);
>>> + drm_sched_job_done(s_fence);
>>> else if (r)
>>> DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
>>> r);
>>> @@ -1064,7 +1044,7 @@ static int drm_sched_main(void *param)
>>> if (IS_ERR(fence))
>>> dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));
>>> - drm_sched_job_done(sched_job);
>>> + drm_sched_job_done(s_fence);
>>> }
>>> wake_up(&sched->job_scheduled);
>>> diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
>>> index 1f7d9dd1a444..7258e2fa195f 100644
>>> --- a/include/drm/gpu_scheduler.h
>>> +++ b/include/drm/gpu_scheduler.h
>>> @@ -281,6 +281,10 @@ struct drm_sched_fence {
>>> * @owner: job owner for debugging
>>> */
>>> void *owner;
>>> + /**
>>> + * @cb: callback
>>> + */
>>> + struct dma_fence_cb cb;
>>> };
>>> struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
>>> @@ -300,7 +304,6 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f);
>>> * be scheduled further.
>>> * @s_priority: the priority of the job.
>>> * @entity: the entity to which this job belongs.
>>> - * @cb: the callback for the parent fence in s_fence.
>>> *
>>> * A job is created by the driver using drm_sched_job_init(), and
>>> * should call drm_sched_entity_push_job() once it wants the scheduler
>>> @@ -325,7 +328,6 @@ struct drm_sched_job {
>>> atomic_t karma;
>>> enum drm_sched_priority s_priority;
>>> struct drm_sched_entity *entity;
>>> - struct dma_fence_cb cb;
>>> /**
>>> * @dependencies:
>>> *
>>> @@ -559,6 +561,12 @@ void drm_sched_fence_free(struct drm_sched_fence *fence);
>>> void drm_sched_fence_scheduled(struct drm_sched_fence *fence);
>>> void drm_sched_fence_finished(struct drm_sched_fence *fence);
>>> +int drm_sched_fence_add_parent_cb(struct dma_fence *fence,
>>> + struct drm_sched_fence *s_fence);
>>> +bool drm_sched_fence_remove_parent_cb(struct drm_sched_fence *s_fence);
>>> +int drm_sched_fence_set_parent(struct dma_fence *fence,
>>> + struct drm_sched_fence *s_fence);
>>> +
>>> unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched);
>>> void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
>>> unsigned long remaining);
>>