[PATCH 4/4] drm/amdgpu: prevent gpu access during reset recovery
Christian König
christian.koenig at amd.com
Thu May 23 07:16:23 UTC 2024
On 22.05.24 19:27, Yunxiang Li wrote:
> Random accesses to the GPU while it is not yet re-initialized can cause
> hangs or worse, so add an rwsem to guard against them. Normal accesses
> now take the read lock for shared GPU access, while reset takes the
> write lock for exclusive GPU access.
>
> Care needs to be taken that the recovery thread does not take the read
> lock and deadlock itself, and that normal accesses do not wait for the
> reset to finish but instead treat the hardware access as failed.
>
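The locking pattern the patch is going for, as I read it (a simplified
sketch, not the exact code from the patch):

    /* normal access paths: trylock the read side; if that fails a
     * reset is in flight, so treat the HW access as failed instead
     * of blocking until the reset finishes */
    if (!down_read_trylock(&adev->reset_domain->gpu_sem))
        return -EIO;
    /* ... touch the hardware ... */
    up_read(&adev->reset_domain->gpu_sem);

    /* reset path: take the write side for exclusive HW access */
    down_write(&adev->reset_domain->gpu_sem);
    /* ... reset and re-init the GPU ... */
    up_write(&adev->reset_domain->gpu_sem);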
> Signed-off-by: Yunxiang Li <Yunxiang.Li at amd.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 5 ++
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 22 ++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 74 ++++++++++---------
> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 +
> drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 7 +-
> drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 7 +-
> .../drm/amd/amdkfd/kfd_device_queue_manager.c | 3 +-
> .../amd/amdkfd/kfd_process_queue_manager.c | 8 +-
> 9 files changed, 79 insertions(+), 49 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 1f71c7b98d77..5a089e2dec2a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1632,6 +1632,11 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev)
>
> int amdgpu_in_reset(struct amdgpu_device *adev);
>
> +void amdgpu_lock_hw_access(struct amdgpu_device *adev);
> +void amdgpu_unlock_hw_access(struct amdgpu_device *adev);
> +int amdgpu_begin_hw_access(struct amdgpu_device *adev);
> +void amdgpu_end_hw_access(struct amdgpu_device *adev);
> +
Don't add anything to amdgpu.h. We are slowly decomposing that file.
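If helpers like these survive in some form, a sketch of where the
declarations could live instead (assuming amdgpu_reset.h remains the
home of the reset-domain interfaces; the names here are placeholders):

    /* in amdgpu_reset.h, next to the other reset-domain interfaces */
    int amdgpu_reset_domain_begin_access(struct amdgpu_device *adev);
    void amdgpu_reset_domain_end_access(struct amdgpu_device *adev);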
> extern const struct attribute_group amdgpu_vram_mgr_attr_group;
> extern const struct attribute_group amdgpu_gtt_mgr_attr_group;
> extern const struct attribute_group amdgpu_flash_attr_group;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e74789691070..057d735c7cae 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5816,6 +5816,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> goto skip_hw_reset;
> }
>
> + amdgpu_lock_hw_access(adev);
That should already be locked here. So this will most likely deadlock.
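For illustration, assuming the recovery path has already taken the
write side further up the call chain: kernel rwsems are not recursive,
so the same thread taking the write lock twice blocks forever.

    down_write(&adev->reset_domain->gpu_sem); /* taken earlier in recovery */
    /* ... */
    down_write(&adev->reset_domain->gpu_sem); /* same thread: deadlock */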
> retry: /* Rest of adevs pre asic reset from XGMI hive. */
> list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
> r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
> @@ -5852,6 +5853,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
> */
> amdgpu_device_stop_pending_resets(tmp_adev);
> }
> + amdgpu_unlock_hw_access(adev);
>
> skip_hw_reset:
>
> @@ -6449,6 +6451,26 @@ int amdgpu_in_reset(struct amdgpu_device *adev)
> return atomic_read(&adev->reset_domain->in_gpu_reset);
> }
>
> +void amdgpu_lock_hw_access(struct amdgpu_device *adev)
> +{
> + down_write(&adev->reset_domain->gpu_sem);
> +}
> +
> +void amdgpu_unlock_hw_access(struct amdgpu_device *adev)
> +{
> + up_write(&adev->reset_domain->gpu_sem);
> +}
> +
> +int amdgpu_begin_hw_access(struct amdgpu_device *adev)
> +{
> + return down_read_trylock(&adev->reset_domain->gpu_sem);
> +}
> +
> +void amdgpu_end_hw_access(struct amdgpu_device *adev)
> +{
> + up_read(&adev->reset_domain->gpu_sem);
> +}
> +
Don't add helpers for that, especially not with that name.
We don't block HW access, but just prevent concurrent resets.
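A minimal sketch of what the call sites can keep doing with the
existing reset_domain->sem, without new wrappers:

    /* skip the HW access while a reset holds the write side */
    if (!down_read_trylock(&adev->reset_domain->sem))
        return 0;
    /* ... HW access ... */
    up_read(&adev->reset_domain->sem);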
> /**
> * amdgpu_device_halt() - bring hardware to some kind of halt state
> *
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index 603c0738fd03..098755db9d20 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -623,12 +623,11 @@ void amdgpu_gmc_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
> !adev->mman.buffer_funcs_enabled ||
> !adev->ib_pool_ready || amdgpu_in_reset(adev) ||
> !ring->sched.ready) {
> -
> /*
> * A GPU reset should flush all TLBs anyway, so no need to do
> * this while one is ongoing.
> */
> - if (!down_read_trylock(&adev->reset_domain->sem))
> + if (!amdgpu_begin_hw_access(adev))
> return;
>
> if (adev->gmc.flush_tlb_needs_extra_type_2)
> @@ -641,7 +640,8 @@ void amdgpu_gmc_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>
> adev->gmc.gmc_funcs->flush_gpu_tlb(adev, vmid, vmhub,
> flush_type);
> - up_read(&adev->reset_domain->sem);
> +
> + amdgpu_end_hw_access(adev);
> return;
> }
>
> @@ -684,12 +684,18 @@ int amdgpu_gmc_flush_gpu_tlb_pasid(struct amdgpu_device *adev, uint16_t pasid,
> struct amdgpu_ring *ring = &adev->gfx.kiq[inst].ring;
> struct amdgpu_kiq *kiq = &adev->gfx.kiq[inst];
> unsigned int ndw;
> - signed long r;
> + signed long r = 0;
Please don't initialize local variables unless it's necessary.
> uint32_t seq;
>
> - if (!adev->gmc.flush_pasid_uses_kiq || !ring->sched.ready ||
> - !down_read_trylock(&adev->reset_domain->sem)) {
> + /*
> + * A GPU reset should flush all TLBs anyway, so no need to do
> + * this while one is ongoing.
> + */
> + if (!amdgpu_begin_hw_access(adev))
> + return 0;
>
> + if (!adev->gmc.flush_pasid_uses_kiq || !ring->sched.ready ||
> + amdgpu_in_reset(adev)) {
That check doesn't make sense any more; the trylock above should
already fail while a reset is in progress.
> if (adev->gmc.flush_tlb_needs_extra_type_2)
> adev->gmc.gmc_funcs->flush_gpu_tlb_pasid(adev, pasid,
> 2, all_hub,
> @@ -703,46 +709,42 @@ int amdgpu_gmc_flush_gpu_tlb_pasid(struct amdgpu_device *adev, uint16_t pasid,
> adev->gmc.gmc_funcs->flush_gpu_tlb_pasid(adev, pasid,
> flush_type, all_hub,
> inst);
> - return 0;
> - }
> + } else {
That the locking is missing here should be a separate bug fix,
independent of the other changes.
Regards,
Christian.
> + /* 2 dwords flush + 8 dwords fence */
> + ndw = kiq->pmf->invalidate_tlbs_size + 8;
>
> - /* 2 dwords flush + 8 dwords fence */
> - ndw = kiq->pmf->invalidate_tlbs_size + 8;
> + if (adev->gmc.flush_tlb_needs_extra_type_2)
> + ndw += kiq->pmf->invalidate_tlbs_size;
>
> - if (adev->gmc.flush_tlb_needs_extra_type_2)
> - ndw += kiq->pmf->invalidate_tlbs_size;
> + if (adev->gmc.flush_tlb_needs_extra_type_0)
> + ndw += kiq->pmf->invalidate_tlbs_size;
>
> - if (adev->gmc.flush_tlb_needs_extra_type_0)
> - ndw += kiq->pmf->invalidate_tlbs_size;
> + spin_lock(&adev->gfx.kiq[inst].ring_lock);
> + amdgpu_ring_alloc(ring, ndw);
> + if (adev->gmc.flush_tlb_needs_extra_type_2)
> + kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 2, all_hub);
>
> - spin_lock(&adev->gfx.kiq[inst].ring_lock);
> - amdgpu_ring_alloc(ring, ndw);
> - if (adev->gmc.flush_tlb_needs_extra_type_2)
> - kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 2, all_hub);
> + if (flush_type == 2 && adev->gmc.flush_tlb_needs_extra_type_0)
> + kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 0, all_hub);
>
> - if (flush_type == 2 && adev->gmc.flush_tlb_needs_extra_type_0)
> - kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 0, all_hub);
> + kiq->pmf->kiq_invalidate_tlbs(ring, pasid, flush_type, all_hub);
> + r = amdgpu_fence_emit_polling(ring, &seq, MAX_KIQ_REG_WAIT);
> + if (r) {
> + amdgpu_ring_undo(ring);
> + spin_unlock(&adev->gfx.kiq[inst].ring_lock);
> + goto error_unlock_reset;
> + }
>
> - kiq->pmf->kiq_invalidate_tlbs(ring, pasid, flush_type, all_hub);
> - r = amdgpu_fence_emit_polling(ring, &seq, MAX_KIQ_REG_WAIT);
> - if (r) {
> - amdgpu_ring_undo(ring);
> + amdgpu_ring_commit(ring);
> spin_unlock(&adev->gfx.kiq[inst].ring_lock);
> - goto error_unlock_reset;
> - }
> -
> - amdgpu_ring_commit(ring);
> - spin_unlock(&adev->gfx.kiq[inst].ring_lock);
> - r = amdgpu_fence_wait_polling(ring, seq, usec_timeout);
> - if (r < 1) {
> - dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
> - r = -ETIME;
> - goto error_unlock_reset;
> + if (amdgpu_fence_wait_polling(ring, seq, usec_timeout) < 1) {
> + dev_err(adev->dev, "timeout waiting for kiq fence\n");
> + r = -ETIME;
> + }
> }
> - r = 0;
>
> error_unlock_reset:
> - up_read(&adev->reset_domain->sem);
> + amdgpu_end_hw_access(adev);
> return r;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index bfdde772b7ee..01b109ec705b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -144,6 +144,7 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
> atomic_set(&reset_domain->in_gpu_reset, 0);
> atomic_set(&reset_domain->reset_res, 0);
> init_rwsem(&reset_domain->sem);
> + init_rwsem(&reset_domain->gpu_sem);
>
> return reset_domain;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index 5a9cc043b858..a8ea493ef0e4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -86,6 +86,7 @@ struct amdgpu_reset_domain {
> struct workqueue_struct *wq;
> enum amdgpu_reset_domain_type type;
> struct rw_semaphore sem;
> + struct rw_semaphore gpu_sem;
> atomic_t in_gpu_reset;
> atomic_t reset_res;
> };
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index f4c47492e0cd..38bfd1bb1192 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -259,11 +259,9 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
> * otherwise the mailbox msg will be ruined/reseted by
> * the VF FLR.
> */
> - if (atomic_cmpxchg(&adev->reset_domain->in_gpu_reset, 0, 1) != 0)
> + if (!amdgpu_begin_hw_access(adev))
> return;
>
> - down_write(&adev->reset_domain->sem);
> -
> amdgpu_virt_fini_data_exchange(adev);
>
> xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
> @@ -279,8 +277,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
> dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n");
>
> flr_done:
> - atomic_set(&adev->reset_domain->in_gpu_reset, 0);
> - up_write(&adev->reset_domain->sem);
> + amdgpu_end_hw_access(adev);
>
> /* Trigger recovery for world switch failure if no TDR */
> if (amdgpu_device_should_recover_gpu(adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> index 37b49a5ed2a1..522198b511d7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> @@ -292,11 +292,9 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
> * otherwise the mailbox msg will be ruined/reseted by
> * the VF FLR.
> */
> - if (atomic_cmpxchg(&adev->reset_domain->in_gpu_reset, 0, 1) != 0)
> + if (!amdgpu_begin_hw_access(adev))
> return;
>
> - down_write(&adev->reset_domain->sem);
> -
> amdgpu_virt_fini_data_exchange(adev);
>
> xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
> @@ -312,8 +310,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
> dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n");
>
> flr_done:
> - atomic_set(&adev->reset_domain->in_gpu_reset, 0);
> - up_write(&adev->reset_domain->sem);
> + amdgpu_end_hw_access(adev);
>
> /* Trigger recovery for world switch failure if no TDR */
> if (amdgpu_device_should_recover_gpu(adev)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 4721b2fccd06..ed96ed46d912 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -262,7 +262,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
> int r;
> struct mes_remove_queue_input queue_input;
>
> - if (dqm->is_hws_hang)
> + if (dqm->is_hws_hang || !amdgpu_begin_hw_access(adev))
> return -EIO;
>
> memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input));
> @@ -272,6 +272,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
> amdgpu_mes_lock(&adev->mes);
> r = adev->mes.funcs->remove_hw_queue(&adev->mes, &queue_input);
> amdgpu_mes_unlock(&adev->mes);
> + amdgpu_end_hw_access(adev);
>
> if (r) {
> dev_err(adev->dev, "failed to remove hardware queue from MES, doorbell=0x%x\n",
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index 6bf79c435f2e..c2bccc78525c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -87,8 +87,12 @@ void kfd_process_dequeue_from_device(struct kfd_process_device *pdd)
> return;
>
> dev->dqm->ops.process_termination(dev->dqm, &pdd->qpd);
> - if (dev->kfd->shared_resources.enable_mes)
> - amdgpu_mes_flush_shader_debugger(dev->adev, pdd->proc_ctx_gpu_addr);
> + if (dev->kfd->shared_resources.enable_mes &&
> + amdgpu_begin_hw_access(dev->adev)) {
> + amdgpu_mes_flush_shader_debugger(dev->adev,
> + pdd->proc_ctx_gpu_addr);
> + amdgpu_end_hw_access(dev->adev);
> + }
> pdd->already_dequeued = true;
> }
>