[PATCH 4/4] drm/amdgpu: prevent gpu access during reset recovery

Christian König christian.koenig at amd.com
Thu May 23 07:16:23 UTC 2024


Am 22.05.24 um 19:27 schrieb Yunxiang Li:
> Random accesses to the GPU while it is not re-initialized can lead to a
> bad time. So add a rwsem to prevent such accesses. Normal accesses will
> now take the read lock for shared GPU access, reset takes the write lock
> for exclusive GPU access.
>
> Care need to be taken so that the recovery thread does not take the read
> lock and deadlock itself, and normal access should avoid waiting on the
> reset to finish and should instead treat the hardware access as failed.
>
> Signed-off-by: Yunxiang Li <Yunxiang.Li at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h           |  5 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c    | 22 ++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c       | 74 ++++++++++---------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c     |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h     |  1 +
>   drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c         |  7 +-
>   drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c         |  7 +-
>   .../drm/amd/amdkfd/kfd_device_queue_manager.c |  3 +-
>   .../amd/amdkfd/kfd_process_queue_manager.c    |  8 +-
>   9 files changed, 79 insertions(+), 49 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 1f71c7b98d77..5a089e2dec2a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1632,6 +1632,11 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev)
>   
>   int amdgpu_in_reset(struct amdgpu_device *adev);
>   
> +void amdgpu_lock_hw_access(struct amdgpu_device *adev);
> +void amdgpu_unlock_hw_access(struct amdgpu_device *adev);
> +int amdgpu_begin_hw_access(struct amdgpu_device *adev);
> +void amdgpu_end_hw_access(struct amdgpu_device *adev);
> +

Don't add anything to amdgpu.h. We are slowly decomposing that file.

>   extern const struct attribute_group amdgpu_vram_mgr_attr_group;
>   extern const struct attribute_group amdgpu_gtt_mgr_attr_group;
>   extern const struct attribute_group amdgpu_flash_attr_group;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e74789691070..057d735c7cae 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5816,6 +5816,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   		goto skip_hw_reset;
>   	}
>   
> +	amdgpu_lock_hw_access(adev);

That should already be locked here. So this will most likely deadlock.

>   retry:	/* Rest of adevs pre asic reset from XGMI hive. */
>   	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
>   		r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
> @@ -5852,6 +5853,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>   		 */
>   		amdgpu_device_stop_pending_resets(tmp_adev);
>   	}
> +	amdgpu_unlock_hw_access(adev);
>   
>   skip_hw_reset:
>   
> @@ -6449,6 +6451,26 @@ int amdgpu_in_reset(struct amdgpu_device *adev)
>   	return atomic_read(&adev->reset_domain->in_gpu_reset);
>   }
>   
> +void amdgpu_lock_hw_access(struct amdgpu_device *adev)
> +{
> +	down_write(&adev->reset_domain->gpu_sem);
> +}
> +
> +void amdgpu_unlock_hw_access(struct amdgpu_device *adev)
> +{
> +	up_write(&adev->reset_domain->gpu_sem);
> +}
> +
> +int amdgpu_begin_hw_access(struct amdgpu_device *adev)
> +{
> +	return down_read_trylock(&adev->reset_domain->gpu_sem);
> +}
> +
> +void amdgpu_end_hw_access(struct amdgpu_device *adev)
> +{
> +	up_read(&adev->reset_domain->gpu_sem);
> +}
> +

Don't add helpers for that, especially not with that name.

We don't block HW access, but just prevent concurrent resets.

>   /**
>    * amdgpu_device_halt() - bring hardware to some kind of halt state
>    *
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index 603c0738fd03..098755db9d20 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -623,12 +623,11 @@ void amdgpu_gmc_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   	    !adev->mman.buffer_funcs_enabled ||
>   	    !adev->ib_pool_ready || amdgpu_in_reset(adev) ||
>   	    !ring->sched.ready) {
> -
>   		/*
>   		 * A GPU reset should flush all TLBs anyway, so no need to do
>   		 * this while one is ongoing.
>   		 */
> -		if (!down_read_trylock(&adev->reset_domain->sem))
> +		if (!amdgpu_begin_hw_access(adev))
>   			return;
>   
>   		if (adev->gmc.flush_tlb_needs_extra_type_2)
> @@ -641,7 +640,8 @@ void amdgpu_gmc_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid,
>   
>   		adev->gmc.gmc_funcs->flush_gpu_tlb(adev, vmid, vmhub,
>   						   flush_type);
> -		up_read(&adev->reset_domain->sem);
> +
> +		amdgpu_end_hw_access(adev);
>   		return;
>   	}
>   
> @@ -684,12 +684,18 @@ int amdgpu_gmc_flush_gpu_tlb_pasid(struct amdgpu_device *adev, uint16_t pasid,
>   	struct amdgpu_ring *ring = &adev->gfx.kiq[inst].ring;
>   	struct amdgpu_kiq *kiq = &adev->gfx.kiq[inst];
>   	unsigned int ndw;
> -	signed long r;
> +	signed long r = 0;

Please don't initialize local variables if it isn't necessary.

>   	uint32_t seq;
>   
> -	if (!adev->gmc.flush_pasid_uses_kiq || !ring->sched.ready ||
> -	    !down_read_trylock(&adev->reset_domain->sem)) {
> +	/*
> +	* A GPU reset should flush all TLBs anyway, so no need to do
> +	* this while one is ongoing.
> +	*/
> +	if (!amdgpu_begin_hw_access(adev))
> +		return 0;
>   
> +	if (!adev->gmc.flush_pasid_uses_kiq || !ring->sched.ready ||
> +	    amdgpu_in_reset(adev)) {

That check doesn't makes sense now any more.

>   		if (adev->gmc.flush_tlb_needs_extra_type_2)
>   			adev->gmc.gmc_funcs->flush_gpu_tlb_pasid(adev, pasid,
>   								 2, all_hub,
> @@ -703,46 +709,42 @@ int amdgpu_gmc_flush_gpu_tlb_pasid(struct amdgpu_device *adev, uint16_t pasid,
>   		adev->gmc.gmc_funcs->flush_gpu_tlb_pasid(adev, pasid,
>   							 flush_type, all_hub,
>   							 inst);
> -		return 0;
> -	}
> +	} else {

That the locking is missing here should be a separate bug fix 
independent of other changes.

Regards,
Christian.

> +		/* 2 dwords flush + 8 dwords fence */
> +		ndw = kiq->pmf->invalidate_tlbs_size + 8;
>   
> -	/* 2 dwords flush + 8 dwords fence */
> -	ndw = kiq->pmf->invalidate_tlbs_size + 8;
> +		if (adev->gmc.flush_tlb_needs_extra_type_2)
> +			ndw += kiq->pmf->invalidate_tlbs_size;
>   
> -	if (adev->gmc.flush_tlb_needs_extra_type_2)
> -		ndw += kiq->pmf->invalidate_tlbs_size;
> +		if (adev->gmc.flush_tlb_needs_extra_type_0)
> +			ndw += kiq->pmf->invalidate_tlbs_size;
>   
> -	if (adev->gmc.flush_tlb_needs_extra_type_0)
> -		ndw += kiq->pmf->invalidate_tlbs_size;
> +		spin_lock(&adev->gfx.kiq[inst].ring_lock);
> +		amdgpu_ring_alloc(ring, ndw);
> +		if (adev->gmc.flush_tlb_needs_extra_type_2)
> +			kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 2, all_hub);
>   
> -	spin_lock(&adev->gfx.kiq[inst].ring_lock);
> -	amdgpu_ring_alloc(ring, ndw);
> -	if (adev->gmc.flush_tlb_needs_extra_type_2)
> -		kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 2, all_hub);
> +		if (flush_type == 2 && adev->gmc.flush_tlb_needs_extra_type_0)
> +			kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 0, all_hub);
>   
> -	if (flush_type == 2 && adev->gmc.flush_tlb_needs_extra_type_0)
> -		kiq->pmf->kiq_invalidate_tlbs(ring, pasid, 0, all_hub);
> +		kiq->pmf->kiq_invalidate_tlbs(ring, pasid, flush_type, all_hub);
> +		r = amdgpu_fence_emit_polling(ring, &seq, MAX_KIQ_REG_WAIT);
> +		if (r) {
> +			amdgpu_ring_undo(ring);
> +			spin_unlock(&adev->gfx.kiq[inst].ring_lock);
> +			goto error_unlock_reset;
> +		}
>   
> -	kiq->pmf->kiq_invalidate_tlbs(ring, pasid, flush_type, all_hub);
> -	r = amdgpu_fence_emit_polling(ring, &seq, MAX_KIQ_REG_WAIT);
> -	if (r) {
> -		amdgpu_ring_undo(ring);
> +		amdgpu_ring_commit(ring);
>   		spin_unlock(&adev->gfx.kiq[inst].ring_lock);
> -		goto error_unlock_reset;
> -	}
> -
> -	amdgpu_ring_commit(ring);
> -	spin_unlock(&adev->gfx.kiq[inst].ring_lock);
> -	r = amdgpu_fence_wait_polling(ring, seq, usec_timeout);
> -	if (r < 1) {
> -		dev_err(adev->dev, "wait for kiq fence error: %ld.\n", r);
> -		r = -ETIME;
> -		goto error_unlock_reset;
> +		if (amdgpu_fence_wait_polling(ring, seq, usec_timeout) < 1) {
> +			dev_err(adev->dev, "timeout waiting for kiq fence\n");
> +			r = -ETIME;
> +		}
>   	}
> -	r = 0;
>   
>   error_unlock_reset:
> -	up_read(&adev->reset_domain->sem);
> +	amdgpu_end_hw_access(adev);
>   	return r;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index bfdde772b7ee..01b109ec705b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -144,6 +144,7 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
>   	atomic_set(&reset_domain->in_gpu_reset, 0);
>   	atomic_set(&reset_domain->reset_res, 0);
>   	init_rwsem(&reset_domain->sem);
> +	init_rwsem(&reset_domain->gpu_sem);
>   
>   	return reset_domain;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index 5a9cc043b858..a8ea493ef0e4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -86,6 +86,7 @@ struct amdgpu_reset_domain {
>   	struct workqueue_struct *wq;
>   	enum amdgpu_reset_domain_type type;
>   	struct rw_semaphore sem;
> +	struct rw_semaphore gpu_sem;
>   	atomic_t in_gpu_reset;
>   	atomic_t reset_res;
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index f4c47492e0cd..38bfd1bb1192 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -259,11 +259,9 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>   	 * otherwise the mailbox msg will be ruined/reseted by
>   	 * the VF FLR.
>   	 */
> -	if (atomic_cmpxchg(&adev->reset_domain->in_gpu_reset, 0, 1) != 0)
> +	if (!amdgpu_begin_hw_access(adev))
>   		return;
>   
> -	down_write(&adev->reset_domain->sem);
> -
>   	amdgpu_virt_fini_data_exchange(adev);
>   
>   	xgpu_ai_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
> @@ -279,8 +277,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>   	dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n");
>   
>   flr_done:
> -	atomic_set(&adev->reset_domain->in_gpu_reset, 0);
> -	up_write(&adev->reset_domain->sem);
> +	amdgpu_end_hw_access(adev);
>   
>   	/* Trigger recovery for world switch failure if no TDR */
>   	if (amdgpu_device_should_recover_gpu(adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> index 37b49a5ed2a1..522198b511d7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> @@ -292,11 +292,9 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>   	 * otherwise the mailbox msg will be ruined/reseted by
>   	 * the VF FLR.
>   	 */
> -	if (atomic_cmpxchg(&adev->reset_domain->in_gpu_reset, 0, 1) != 0)
> +	if (!amdgpu_begin_hw_access(adev))
>   		return;
>   
> -	down_write(&adev->reset_domain->sem);
> -
>   	amdgpu_virt_fini_data_exchange(adev);
>   
>   	xgpu_nv_mailbox_trans_msg(adev, IDH_READY_TO_RESET, 0, 0, 0);
> @@ -312,8 +310,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>   	dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n");
>   
>   flr_done:
> -	atomic_set(&adev->reset_domain->in_gpu_reset, 0);
> -	up_write(&adev->reset_domain->sem);
> +	amdgpu_end_hw_access(adev);
>   
>   	/* Trigger recovery for world switch failure if no TDR */
>   	if (amdgpu_device_should_recover_gpu(adev)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 4721b2fccd06..ed96ed46d912 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -262,7 +262,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>   	int r;
>   	struct mes_remove_queue_input queue_input;
>   
> -	if (dqm->is_hws_hang)
> +	if (dqm->is_hws_hang || !amdgpu_begin_hw_access(adev))
>   		return -EIO;
>   
>   	memset(&queue_input, 0x0, sizeof(struct mes_remove_queue_input));
> @@ -272,6 +272,7 @@ static int remove_queue_mes(struct device_queue_manager *dqm, struct queue *q,
>   	amdgpu_mes_lock(&adev->mes);
>   	r = adev->mes.funcs->remove_hw_queue(&adev->mes, &queue_input);
>   	amdgpu_mes_unlock(&adev->mes);
> +	amdgpu_end_hw_access(adev);
>   
>   	if (r) {
>   		dev_err(adev->dev, "failed to remove hardware queue from MES, doorbell=0x%x\n",
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index 6bf79c435f2e..c2bccc78525c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -87,8 +87,12 @@ void kfd_process_dequeue_from_device(struct kfd_process_device *pdd)
>   		return;
>   
>   	dev->dqm->ops.process_termination(dev->dqm, &pdd->qpd);
> -	if (dev->kfd->shared_resources.enable_mes)
> -		amdgpu_mes_flush_shader_debugger(dev->adev, pdd->proc_ctx_gpu_addr);
> +	if (dev->kfd->shared_resources.enable_mes &&
> +	    amdgpu_begin_hw_access(dev->adev)) {
> +		amdgpu_mes_flush_shader_debugger(dev->adev,
> +						 pdd->proc_ctx_gpu_addr);
> +		amdgpu_end_hw_access(dev->adev);
> +	}
>   	pdd->already_dequeued = true;
>   }
>   



More information about the amd-gfx mailing list