[PATCH 01/27] drm/amdgpu: switch job hw_fence to amdgpu_fence
Christian König
christian.koenig at amd.com
Mon Jun 16 17:45:32 UTC 2025
On 6/16/25 15:47, Alex Deucher wrote:
> On Mon, Jun 16, 2025 at 8:16 AM Christian König
> <christian.koenig at amd.com> wrote:
>>
>> On 6/13/25 23:47, Alex Deucher wrote:
>>> Use the amdgpu fence container so we can store additional
>>> data in the fence. This also fixes the start_time handling
>>> for MCBP since we were casting the fence to an amdgpu_fence
>>> and it wasn't.
>>>
>>> Fixes: 3f4c175d62d8 ("drm/amdgpu: MCBP based on DRM scheduler (v9)")
>>> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 2 +-
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 30 +++++----------------
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 12 ++++-----
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 2 +-
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 16 +++++++++++
>>> 6 files changed, 32 insertions(+), 32 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> index 8e626f50b362e..f81608330a3d0 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>> @@ -1902,7 +1902,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct amdgpu_ring *ring)
>>> continue;
>>> }
>>> job = to_amdgpu_job(s_job);
>>> - if (preempted && (&job->hw_fence) == fence)
>>> + if (preempted && (&job->hw_fence.base) == fence)
>>> /* mark the job as preempted */
>>> job->preemption_status |= AMDGPU_IB_PREEMPTED;
>>> }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 00174437b01ec..4893f834f4fd4 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -6397,7 +6397,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>>> *
>>> * job->base holds a reference to parent fence
>>> */
>>> - if (job && dma_fence_is_signaled(&job->hw_fence)) {
>>> + if (job && dma_fence_is_signaled(&job->hw_fence.base)) {
>>> job_signaled = true;
>>> dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
>>> goto skip_hw_reset;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> index 2f24a6aa13bf6..569e0e5373927 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
>>> @@ -41,22 +41,6 @@
>>> #include "amdgpu_trace.h"
>>> #include "amdgpu_reset.h"
>>>
>>> -/*
>>> - * Fences mark an event in the GPUs pipeline and are used
>>> - * for GPU/CPU synchronization. When the fence is written,
>>> - * it is expected that all buffers associated with that fence
>>> - * are no longer in use by the associated ring on the GPU and
>>> - * that the relevant GPU caches have been flushed.
>>> - */
>>> -
>>> -struct amdgpu_fence {
>>> - struct dma_fence base;
>>> -
>>> - /* RB, DMA, etc. */
>>> - struct amdgpu_ring *ring;
>>> - ktime_t start_timestamp;
>>> -};
>>> -
>>> static struct kmem_cache *amdgpu_fence_slab;
>>>
>>> int amdgpu_fence_slab_init(void)
>>> @@ -151,12 +135,12 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amd
>>> am_fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_ATOMIC);
>>> if (am_fence == NULL)
>>> return -ENOMEM;
>>> - fence = &am_fence->base;
>>> - am_fence->ring = ring;
>>> } else {
>>> /* take use of job-embedded fence */
>>> - fence = &job->hw_fence;
>>> + am_fence = &job->hw_fence;
>>> }
>>> + fence = &am_fence->base;
>>> + am_fence->ring = ring;
>>
>> I would rather completely drop the job from the parameters and the general fence allocation here.
>>
>> Instead we should just provide afence as input parameter and submit that one.
>>
>> This should make sure that we don't run into such issues again.
>
> How about doing that as a follow on patch? It looks like that will be
> a much bigger patch for a stable bug fix. I think we can clean up a
> lot of stuff in amdgpu_fence.c with that change.
Works for me. I would also suggest to remove the kmem_cache_alloc() and just use kmalloc for the rare cases where we need an independent fence.
Additional to that the ring and start_time member looks suspicious. We should not have that inside the fence in the first place.
Regards,
Christian.
>
> Alex
>
>>
>> Apart from that looks good to me,
>> Christian.
>>
>>>
>>> seq = ++ring->fence_drv.sync_seq;
>>> if (job && job->job_run_counter) {
>>> @@ -718,7 +702,7 @@ void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring)
>>> * it right here or we won't be able to track them in fence_drv
>>> * and they will remain unsignaled during sa_bo free.
>>> */
>>> - job = container_of(old, struct amdgpu_job, hw_fence);
>>> + job = container_of(old, struct amdgpu_job, hw_fence.base);
>>> if (!job->base.s_fence && !dma_fence_is_signaled(old))
>>> dma_fence_signal(old);
>>> RCU_INIT_POINTER(*ptr, NULL);
>>> @@ -780,7 +764,7 @@ static const char *amdgpu_fence_get_timeline_name(struct dma_fence *f)
>>>
>>> static const char *amdgpu_job_fence_get_timeline_name(struct dma_fence *f)
>>> {
>>> - struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence);
>>> + struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence.base);
>>>
>>> return (const char *)to_amdgpu_ring(job->base.sched)->name;
>>> }
>>> @@ -810,7 +794,7 @@ static bool amdgpu_fence_enable_signaling(struct dma_fence *f)
>>> */
>>> static bool amdgpu_job_fence_enable_signaling(struct dma_fence *f)
>>> {
>>> - struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence);
>>> + struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence.base);
>>>
>>> if (!timer_pending(&to_amdgpu_ring(job->base.sched)->fence_drv.fallback_timer))
>>> amdgpu_fence_schedule_fallback(to_amdgpu_ring(job->base.sched));
>>> @@ -845,7 +829,7 @@ static void amdgpu_job_fence_free(struct rcu_head *rcu)
>>> struct dma_fence *f = container_of(rcu, struct dma_fence, rcu);
>>>
>>> /* free job if fence has a parent job */
>>> - kfree(container_of(f, struct amdgpu_job, hw_fence));
>>> + kfree(container_of(f, struct amdgpu_job, hw_fence.base));
>>> }
>>>
>>> /**
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> index acb21fc8b3ce5..ddb9d3269357c 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> @@ -272,8 +272,8 @@ void amdgpu_job_free_resources(struct amdgpu_job *job)
>>> /* Check if any fences where initialized */
>>> if (job->base.s_fence && job->base.s_fence->finished.ops)
>>> f = &job->base.s_fence->finished;
>>> - else if (job->hw_fence.ops)
>>> - f = &job->hw_fence;
>>> + else if (job->hw_fence.base.ops)
>>> + f = &job->hw_fence.base;
>>> else
>>> f = NULL;
>>>
>>> @@ -290,10 +290,10 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
>>> amdgpu_sync_free(&job->explicit_sync);
>>>
>>> /* only put the hw fence if has embedded fence */
>>> - if (!job->hw_fence.ops)
>>> + if (!job->hw_fence.base.ops)
>>> kfree(job);
>>> else
>>> - dma_fence_put(&job->hw_fence);
>>> + dma_fence_put(&job->hw_fence.base);
>>> }
>>>
>>> void amdgpu_job_set_gang_leader(struct amdgpu_job *job,
>>> @@ -322,10 +322,10 @@ void amdgpu_job_free(struct amdgpu_job *job)
>>> if (job->gang_submit != &job->base.s_fence->scheduled)
>>> dma_fence_put(job->gang_submit);
>>>
>>> - if (!job->hw_fence.ops)
>>> + if (!job->hw_fence.base.ops)
>>> kfree(job);
>>> else
>>> - dma_fence_put(&job->hw_fence);
>>> + dma_fence_put(&job->hw_fence.base);
>>> }
>>>
>>> struct dma_fence *amdgpu_job_submit(struct amdgpu_job *job)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> index f2c049129661f..931fed8892cc1 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> @@ -48,7 +48,7 @@ struct amdgpu_job {
>>> struct drm_sched_job base;
>>> struct amdgpu_vm *vm;
>>> struct amdgpu_sync explicit_sync;
>>> - struct dma_fence hw_fence;
>>> + struct amdgpu_fence hw_fence;
>>> struct dma_fence *gang_submit;
>>> uint32_t preamble_status;
>>> uint32_t preemption_status;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> index b95b471107692..e1f25218943a4 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
>>> @@ -127,6 +127,22 @@ struct amdgpu_fence_driver {
>>> struct dma_fence **fences;
>>> };
>>>
>>> +/*
>>> + * Fences mark an event in the GPUs pipeline and are used
>>> + * for GPU/CPU synchronization. When the fence is written,
>>> + * it is expected that all buffers associated with that fence
>>> + * are no longer in use by the associated ring on the GPU and
>>> + * that the relevant GPU caches have been flushed.
>>> + */
>>> +
>>> +struct amdgpu_fence {
>>> + struct dma_fence base;
>>> +
>>> + /* RB, DMA, etc. */
>>> + struct amdgpu_ring *ring;
>>> + ktime_t start_timestamp;
>>> +};
>>> +
>>> extern const struct drm_sched_backend_ops amdgpu_sched_ops;
>>>
>>> void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring);
>>
More information about the amd-gfx
mailing list