[PATCH] drm/amdgpu: fix system hang issue during GPU reset

Li, Dennis Dennis.Li at amd.com
Tue Jul 7 01:16:04 UTC 2020


[AMD Official Use Only - Internal Distribution Only]

Hi, Felix,
      Driver should use the same lock to protect hardware from being accessed during GPU reset. The flag dqm->is_resetting couldn't prevent calls that access hardware in multi-threads case. 

Best Regards
Dennis Li
-----Original Message-----
From: Kuehling, Felix <Felix.Kuehling at amd.com> 
Sent: Tuesday, July 7, 2020 5:43 AM
To: Li, Dennis <Dennis.Li at amd.com>; amd-gfx at lists.freedesktop.org; Deucher, Alexander <Alexander.Deucher at amd.com>; Zhou1, Tao <Tao.Zhou1 at amd.com>; Zhang, Hawking <Hawking.Zhang at amd.com>; Chen, Guchun <Guchun.Chen at amd.com>
Subject: Re: [PATCH] drm/amdgpu: fix system hang issue during GPU reset


Am 2020-07-06 um 6:01 a.m. schrieb Dennis Li:
> During GPU reset, driver should hold on all external access to GPU, 
> otherwise psp will randomly fail to do post, and then cause system 
> hang.
>
> Signed-off-by: Dennis Li <dennis.li at amd.com>
> Change-Id: I7d5d41f9c4198b917d7b49606ba3850988e5b936
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 6c7dd0a707c9..34bfc2a147ff 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -965,7 +965,7 @@ struct amdgpu_device {
>  
>  	bool                            in_gpu_reset;
>  	enum pp_mp1_state               mp1_state;
> -	struct mutex  lock_reset;
> +	struct rw_semaphore	reset_sem;
>  	struct amdgpu_doorbell_index doorbell_index;
>  
>  	struct mutex			notifier_lock;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index ad59ac4423b8..4139c81389a4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -611,7 +611,9 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
>  	/* This works for NO_HWS. TODO: need to handle without knowing VMID */
>  	job->vmid = vmid;
>  
> +	down_read(&adev->reset_sem);

This (and other instances below) will introduce some lock dependency issues. Any lock that you take under KFD's DQM lock will inherit the problem that you can't reclaim memory while holding it because the DQM lock is taken in MMU notifiers. That will affect any attempts of allocating memory while holding the reset_sem.

DQM already has an internal flag dqm->is_resetting that is set in the KFD pre_reset callback. It would be better to use that in DQM to prevent any calls that access hardware.

Regards,
  Felix


>  	ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);
> +	up_read(&adev->reset_sem);
>  	if (ret) {
>  		DRM_ERROR("amdgpu: failed to schedule IB.\n");
>  		goto err_ib_sched;
> @@ -649,6 +651,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct 
> kgd_dev *kgd, uint16_t vmid)  {
>  	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
>  
> +	down_read(&adev->reset_sem);
> +
>  	if (adev->family == AMDGPU_FAMILY_AI) {
>  		int i;
>  
> @@ -658,6 +662,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
>  		amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB_0, 0);
>  	}
>  
> +	up_read(&adev->reset_sem);
> +
>  	return 0;
>  }
>  
> @@ -666,11 +672,18 @@ int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct kgd_dev *kgd, uint16_t pasid)
>  	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
>  	const uint32_t flush_type = 0;
>  	bool all_hub = false;
> +	int ret = 0;
>  
>  	if (adev->family == AMDGPU_FAMILY_AI)
>  		all_hub = true;
>  
> -	return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub);
> +	down_read(&adev->reset_sem);
> +
> +	ret = amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, 
> +all_hub);
> +
> +	up_read(&adev->reset_sem);
> +
> +	return ret;
>  }
>  
>  bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd) diff 
> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index 691c89705bcd..db5d533dd406 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -542,6 +542,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
>  	unsigned long end_jiffies;
>  	uint32_t temp;
>  	struct v10_compute_mqd *m = get_mqd(mqd);
> +	int ret = 0;
>  
>  	if (adev->in_gpu_reset)
>  		return -EIO;
> @@ -551,6 +552,8 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
>  	int retry;
>  #endif
>  
> +	down_read(&adev->reset_sem);
> +
>  	acquire_queue(kgd, pipe_id, queue_id);
>  
>  	if (m->cp_hqd_vmid == 0)
> @@ -633,14 +636,16 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
>  			break;
>  		if (time_after(jiffies, end_jiffies)) {
>  			pr_err("cp queue preemption time out.\n");
> -			release_queue(kgd);
> -			return -ETIME;
> +			ret = -ETIME;
> +			goto pro_end;
>  		}
>  		usleep_range(500, 1000);
>  	}
>  
> +pro_end:
>  	release_queue(kgd);
> -	return 0;
> +	up_read(&adev->reset_sem);
> +	return ret;
>  }
>  
>  static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, diff 
> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
> index 0b7e78748540..cf27fe5091aa 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
> @@ -424,10 +424,13 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
>  	enum hqd_dequeue_request_type type;
>  	unsigned long flags, end_jiffies;
>  	int retry;
> +	int ret = 0;
>  
>  	if (adev->in_gpu_reset)
>  		return -EIO;
>  
> +	down_read(&adev->reset_sem);
> +
>  	acquire_queue(kgd, pipe_id, queue_id);
>  	WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, 0);
>  
> @@ -506,14 +509,16 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
>  			break;
>  		if (time_after(jiffies, end_jiffies)) {
>  			pr_err("cp queue preemption time out\n");
> -			release_queue(kgd);
> -			return -ETIME;
> +			ret = -ETIME;
> +			goto pro_end;
>  		}
>  		usleep_range(500, 1000);
>  	}
>  
> +pro_end:
>  	release_queue(kgd);
> -	return 0;
> +	up_read(&adev->reset_sem);
> +	return ret;
>  }
>  
>  static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, diff 
> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
> index ccd635b812b5..bc8e07186a85 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
> @@ -420,10 +420,13 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
>  	unsigned long flags, end_jiffies;
>  	int retry;
>  	struct vi_mqd *m = get_mqd(mqd);
> +	int ret = 0;
>  
>  	if (adev->in_gpu_reset)
>  		return -EIO;
>  
> +	down_read(&adev->reset_sem);
> +
>  	acquire_queue(kgd, pipe_id, queue_id);
>  
>  	if (m->cp_hqd_vmid == 0)
> @@ -504,14 +507,16 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
>  			break;
>  		if (time_after(jiffies, end_jiffies)) {
>  			pr_err("cp queue preemption time out.\n");
> -			release_queue(kgd);
> -			return -ETIME;
> +			ret = -ETIME;
> +			goto pro_end;
>  		}
>  		usleep_range(500, 1000);
>  	}
>  
> +pro_end:
>  	release_queue(kgd);
> -	return 0;
> +	up_read(&adev->reset_sem);
> +	return ret;
>  }
>  
>  static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, diff 
> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index df841c2ac5e7..341ad652d910 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -540,10 +540,13 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
>  	unsigned long end_jiffies;
>  	uint32_t temp;
>  	struct v9_mqd *m = get_mqd(mqd);
> +	int ret = 0;
>  
>  	if (adev->in_gpu_reset)
>  		return -EIO;
>  
> +	down_read(&adev->reset_sem);
> +
>  	acquire_queue(kgd, pipe_id, queue_id);
>  
>  	if (m->cp_hqd_vmid == 0)
> @@ -570,14 +573,16 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
>  			break;
>  		if (time_after(jiffies, end_jiffies)) {
>  			pr_err("cp queue preemption time out.\n");
> -			release_queue(kgd);
> -			return -ETIME;
> +			ret = -ETIME;
> +			goto pro_end;
>  		}
>  		usleep_range(500, 1000);
>  	}
>  
> +pro_end:
>  	release_queue(kgd);
> -	return 0;
> +	up_read(&adev->reset_sem);
> +	return ret;
>  }
>  
>  static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd, diff 
> --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index aeada7c9fbea..d8f264c47b86 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -100,14 +100,14 @@ static int amdgpu_debugfs_autodump_open(struct 
> inode *inode, struct file *file)
>  
>  	file->private_data = adev;
>  
> -	mutex_lock(&adev->lock_reset);
> +	down_read(&adev->reset_sem);
>  	if (adev->autodump.dumping.done) {
>  		reinit_completion(&adev->autodump.dumping);
>  		ret = 0;
>  	} else {
>  		ret = -EBUSY;
>  	}
> -	mutex_unlock(&adev->lock_reset);
> +	up_read(&adev->reset_sem);
>  
>  	return ret;
>  }
> @@ -1188,7 +1188,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
>  	}
>  
>  	/* Avoid accidently unparking the sched thread during GPU reset */
> -	mutex_lock(&adev->lock_reset);
> +	down_read(&adev->reset_sem);
>  
>  	/* hold on the scheduler */
>  	for (i = 0; i < AMDGPU_MAX_RINGS; i++) { @@ -1215,7 +1215,7 @@ 
> static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
>  		kthread_unpark(ring->sched.thread);
>  	}
>  
> -	mutex_unlock(&adev->lock_reset);
> +	up_read(&adev->reset_sem);
>  
>  	pm_runtime_mark_last_busy(dev->dev);
>  	pm_runtime_put_autosuspend(dev->dev);
> @@ -1395,7 +1395,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
>  		return -ENOMEM;
>  
>  	/* Avoid accidently unparking the sched thread during GPU reset */
> -	mutex_lock(&adev->lock_reset);
> +	down_read(&adev->reset_sem);
>  
>  	/* stop the scheduler */
>  	kthread_park(ring->sched.thread);
> @@ -1436,7 +1436,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
>  	/* restart the scheduler */
>  	kthread_unpark(ring->sched.thread);
>  
> -	mutex_unlock(&adev->lock_reset);
> +	up_read(&adev->reset_sem);
>  
>  	ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 2858c09fd8c0..bc902c59c1c0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3044,7 +3044,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
>  	mutex_init(&adev->mn_lock);
>  	mutex_init(&adev->virt.vf_errors.lock);
>  	hash_init(adev->mn_hash);
> -	mutex_init(&adev->lock_reset);
> +	init_rwsem(&adev->reset_sem);
>  	mutex_init(&adev->psp.mutex);
>  	mutex_init(&adev->notifier_lock);
>  
> @@ -4150,10 +4150,10 @@ static int amdgpu_do_asic_reset(struct 
> amdgpu_hive_info *hive,  static bool amdgpu_device_lock_adev(struct 
> amdgpu_device *adev, bool trylock)  {
>  	if (trylock) {
> -		if (!mutex_trylock(&adev->lock_reset))
> +		if (!down_write_trylock(&adev->reset_sem))
>  			return false;
>  	} else
> -		mutex_lock(&adev->lock_reset);
> +		down_write(&adev->reset_sem);
>  
>  	atomic_inc(&adev->gpu_reset_counter);
>  	adev->in_gpu_reset = true;
> @@ -4177,7 +4177,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
>  	amdgpu_vf_error_trans_all(adev);
>  	adev->mp1_state = PP_MP1_STATE_NONE;
>  	adev->in_gpu_reset = false;
> -	mutex_unlock(&adev->lock_reset);
> +	up_write(&adev->reset_sem);
>  }
>  
>  static void amdgpu_device_resume_display_audio(struct amdgpu_device 
> *adev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 2975c4a6e581..d4c69f9577a4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -225,8 +225,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
>  	if (finished->error < 0) {
>  		DRM_INFO("Skip scheduling IBs!\n");
>  	} else {
> +		down_read(&ring->adev->reset_sem);
>  		r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
>  				       &fence);
> +		up_read(&ring->adev->reset_sem);
>  		if (r)
>  			DRM_ERROR("Error scheduling IBs (%d)\n", r);
>  	}
> @@ -237,6 +239,7 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
>  	amdgpu_job_free_resources(job);
>  
>  	fence = r ? ERR_PTR(r) : fence;
> +
>  	return fence;
>  }
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index 13ea8ebc421c..38a751f7d889 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -121,6 +121,7 @@ void amdgpu_ring_generic_pad_ib(struct amdgpu_ring 
> *ring, struct amdgpu_ib *ib)  void amdgpu_ring_commit(struct 
> amdgpu_ring *ring)  {
>  	uint32_t count;
> +	struct amdgpu_device *adev = ring->adev;
>  
>  	/* We pad to match fetch size */
>  	count = ring->funcs->align_mask + 1 - diff --git 
> a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c 
> b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index 5fd67e1cc2a0..97f33540aa02 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -244,11 +244,11 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
>  	 * otherwise the mailbox msg will be ruined/reseted by
>  	 * the VF FLR.
>  	 *
> -	 * we can unlock the lock_reset to allow "amdgpu_job_timedout"
> +	 * we can unlock the reset_sem to allow "amdgpu_job_timedout"
>  	 * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
>  	 * which means host side had finished this VF's FLR.
>  	 */
> -	locked = mutex_trylock(&adev->lock_reset);
> +	locked = down_write_trylock(&adev->reset_sem);
>  	if (locked)
>  		adev->in_gpu_reset = true;
>  
> @@ -263,7 +263,7 @@ static void xgpu_ai_mailbox_flr_work(struct 
> work_struct *work)
>  flr_done:
>  	if (locked) {
>  		adev->in_gpu_reset = false;
> -		mutex_unlock(&adev->lock_reset);
> +		up_write(&adev->reset_sem);
>  	}
>  
>  	/* Trigger recovery for world switch failure if no TDR */ diff --git 
> a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c 
> b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> index ce2bf1fb79ed..484d61c22396 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> @@ -265,11 +265,11 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
>  	 * otherwise the mailbox msg will be ruined/reseted by
>  	 * the VF FLR.
>  	 *
> -	 * we can unlock the lock_reset to allow "amdgpu_job_timedout"
> +	 * we can unlock the reset_sem to allow "amdgpu_job_timedout"
>  	 * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
>  	 * which means host side had finished this VF's FLR.
>  	 */
> -	locked = mutex_trylock(&adev->lock_reset);
> +	locked = down_write_trylock(&adev->reset_sem);
>  	if (locked)
>  		adev->in_gpu_reset = true;
>  
> @@ -284,7 +284,7 @@ static void xgpu_nv_mailbox_flr_work(struct 
> work_struct *work)
>  flr_done:
>  	if (locked) {
>  		adev->in_gpu_reset = false;
> -		mutex_unlock(&adev->lock_reset);
> +		up_write(&adev->reset_sem);
>  	}
>  
>  	/* Trigger recovery for world switch failure if no TDR */


More information about the amd-gfx mailing list