[PATCH] drm/amdgpu: fix system hang issue during GPU reset
Christian König
ckoenig.leichtzumerken at gmail.com
Mon Jul 6 10:44:42 UTC 2020
Am 06.07.20 um 12:01 schrieb Dennis Li:
> During GPU reset, driver should hold on all external access to
> GPU, otherwise psp will randomly fail to do post, and then cause
> system hang.
In general a good idea, but that exposes another problem: The trylock
has now a rather right chance to fail and that is not expected.
Christian.
>
> Signed-off-by: Dennis Li <dennis.li at amd.com>
> Change-Id: I7d5d41f9c4198b917d7b49606ba3850988e5b936
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 6c7dd0a707c9..34bfc2a147ff 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -965,7 +965,7 @@ struct amdgpu_device {
>
> bool in_gpu_reset;
> enum pp_mp1_state mp1_state;
> - struct mutex lock_reset;
> + struct rw_semaphore reset_sem;
> struct amdgpu_doorbell_index doorbell_index;
>
> struct mutex notifier_lock;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index ad59ac4423b8..4139c81389a4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -611,7 +611,9 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
> /* This works for NO_HWS. TODO: need to handle without knowing VMID */
> job->vmid = vmid;
>
> + down_read(&adev->reset_sem);
> ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);
> + up_read(&adev->reset_sem);
> if (ret) {
> DRM_ERROR("amdgpu: failed to schedule IB.\n");
> goto err_ib_sched;
> @@ -649,6 +651,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
> {
> struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
>
> + down_read(&adev->reset_sem);
> +
> if (adev->family == AMDGPU_FAMILY_AI) {
> int i;
>
> @@ -658,6 +662,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
> amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB_0, 0);
> }
>
> + up_read(&adev->reset_sem);
> +
> return 0;
> }
>
> @@ -666,11 +672,18 @@ int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct kgd_dev *kgd, uint16_t pasid)
> struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
> const uint32_t flush_type = 0;
> bool all_hub = false;
> + int ret = 0;
>
> if (adev->family == AMDGPU_FAMILY_AI)
> all_hub = true;
>
> - return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub);
> + down_read(&adev->reset_sem);
> +
> + ret = amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub);
> +
> + up_read(&adev->reset_sem);
> +
> + return ret;
> }
>
> bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> index 691c89705bcd..db5d533dd406 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
> @@ -542,6 +542,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
> unsigned long end_jiffies;
> uint32_t temp;
> struct v10_compute_mqd *m = get_mqd(mqd);
> + int ret = 0;
>
> if (adev->in_gpu_reset)
> return -EIO;
> @@ -551,6 +552,8 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
> int retry;
> #endif
>
> + down_read(&adev->reset_sem);
> +
> acquire_queue(kgd, pipe_id, queue_id);
>
> if (m->cp_hqd_vmid == 0)
> @@ -633,14 +636,16 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
> break;
> if (time_after(jiffies, end_jiffies)) {
> pr_err("cp queue preemption time out.\n");
> - release_queue(kgd);
> - return -ETIME;
> + ret = -ETIME;
> + goto pro_end;
> }
> usleep_range(500, 1000);
> }
>
> +pro_end:
> release_queue(kgd);
> - return 0;
> + up_read(&adev->reset_sem);
> + return ret;
> }
>
> static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
> index 0b7e78748540..cf27fe5091aa 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
> @@ -424,10 +424,13 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
> enum hqd_dequeue_request_type type;
> unsigned long flags, end_jiffies;
> int retry;
> + int ret = 0;
>
> if (adev->in_gpu_reset)
> return -EIO;
>
> + down_read(&adev->reset_sem);
> +
> acquire_queue(kgd, pipe_id, queue_id);
> WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, 0);
>
> @@ -506,14 +509,16 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
> break;
> if (time_after(jiffies, end_jiffies)) {
> pr_err("cp queue preemption time out\n");
> - release_queue(kgd);
> - return -ETIME;
> + ret = -ETIME;
> + goto pro_end;
> }
> usleep_range(500, 1000);
> }
>
> +pro_end:
> release_queue(kgd);
> - return 0;
> + up_read(&adev->reset_sem);
> + return ret;
> }
>
> static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
> index ccd635b812b5..bc8e07186a85 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
> @@ -420,10 +420,13 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
> unsigned long flags, end_jiffies;
> int retry;
> struct vi_mqd *m = get_mqd(mqd);
> + int ret = 0;
>
> if (adev->in_gpu_reset)
> return -EIO;
>
> + down_read(&adev->reset_sem);
> +
> acquire_queue(kgd, pipe_id, queue_id);
>
> if (m->cp_hqd_vmid == 0)
> @@ -504,14 +507,16 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
> break;
> if (time_after(jiffies, end_jiffies)) {
> pr_err("cp queue preemption time out.\n");
> - release_queue(kgd);
> - return -ETIME;
> + ret = -ETIME;
> + goto pro_end;
> }
> usleep_range(500, 1000);
> }
>
> +pro_end:
> release_queue(kgd);
> - return 0;
> + up_read(&adev->reset_sem);
> + return ret;
> }
>
> static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> index df841c2ac5e7..341ad652d910 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> @@ -540,10 +540,13 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
> unsigned long end_jiffies;
> uint32_t temp;
> struct v9_mqd *m = get_mqd(mqd);
> + int ret = 0;
>
> if (adev->in_gpu_reset)
> return -EIO;
>
> + down_read(&adev->reset_sem);
> +
> acquire_queue(kgd, pipe_id, queue_id);
>
> if (m->cp_hqd_vmid == 0)
> @@ -570,14 +573,16 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
> break;
> if (time_after(jiffies, end_jiffies)) {
> pr_err("cp queue preemption time out.\n");
> - release_queue(kgd);
> - return -ETIME;
> + ret = -ETIME;
> + goto pro_end;
> }
> usleep_range(500, 1000);
> }
>
> +pro_end:
> release_queue(kgd);
> - return 0;
> + up_read(&adev->reset_sem);
> + return ret;
> }
>
> static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index aeada7c9fbea..d8f264c47b86 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -100,14 +100,14 @@ static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
>
> file->private_data = adev;
>
> - mutex_lock(&adev->lock_reset);
> + down_read(&adev->reset_sem);
> if (adev->autodump.dumping.done) {
> reinit_completion(&adev->autodump.dumping);
> ret = 0;
> } else {
> ret = -EBUSY;
> }
> - mutex_unlock(&adev->lock_reset);
> + up_read(&adev->reset_sem);
>
> return ret;
> }
> @@ -1188,7 +1188,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
> }
>
> /* Avoid accidently unparking the sched thread during GPU reset */
> - mutex_lock(&adev->lock_reset);
> + down_read(&adev->reset_sem);
>
> /* hold on the scheduler */
> for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
> @@ -1215,7 +1215,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
> kthread_unpark(ring->sched.thread);
> }
>
> - mutex_unlock(&adev->lock_reset);
> + up_read(&adev->reset_sem);
>
> pm_runtime_mark_last_busy(dev->dev);
> pm_runtime_put_autosuspend(dev->dev);
> @@ -1395,7 +1395,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
> return -ENOMEM;
>
> /* Avoid accidently unparking the sched thread during GPU reset */
> - mutex_lock(&adev->lock_reset);
> + down_read(&adev->reset_sem);
>
> /* stop the scheduler */
> kthread_park(ring->sched.thread);
> @@ -1436,7 +1436,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
> /* restart the scheduler */
> kthread_unpark(ring->sched.thread);
>
> - mutex_unlock(&adev->lock_reset);
> + up_read(&adev->reset_sem);
>
> ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 2858c09fd8c0..bc902c59c1c0 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3044,7 +3044,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
> mutex_init(&adev->mn_lock);
> mutex_init(&adev->virt.vf_errors.lock);
> hash_init(adev->mn_hash);
> - mutex_init(&adev->lock_reset);
> + init_rwsem(&adev->reset_sem);
> mutex_init(&adev->psp.mutex);
> mutex_init(&adev->notifier_lock);
>
> @@ -4150,10 +4150,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
> static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
> {
> if (trylock) {
> - if (!mutex_trylock(&adev->lock_reset))
> + if (!down_write_trylock(&adev->reset_sem))
> return false;
> } else
> - mutex_lock(&adev->lock_reset);
> + down_write(&adev->reset_sem);
>
> atomic_inc(&adev->gpu_reset_counter);
> adev->in_gpu_reset = true;
> @@ -4177,7 +4177,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
> amdgpu_vf_error_trans_all(adev);
> adev->mp1_state = PP_MP1_STATE_NONE;
> adev->in_gpu_reset = false;
> - mutex_unlock(&adev->lock_reset);
> + up_write(&adev->reset_sem);
> }
>
> static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 2975c4a6e581..d4c69f9577a4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -225,8 +225,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
> if (finished->error < 0) {
> DRM_INFO("Skip scheduling IBs!\n");
> } else {
> + down_read(&ring->adev->reset_sem);
> r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
> &fence);
> + up_read(&ring->adev->reset_sem);
> if (r)
> DRM_ERROR("Error scheduling IBs (%d)\n", r);
> }
> @@ -237,6 +239,7 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
> amdgpu_job_free_resources(job);
>
> fence = r ? ERR_PTR(r) : fence;
> +
> return fence;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index 13ea8ebc421c..38a751f7d889 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -121,6 +121,7 @@ void amdgpu_ring_generic_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
> void amdgpu_ring_commit(struct amdgpu_ring *ring)
> {
> uint32_t count;
> + struct amdgpu_device *adev = ring->adev;
>
> /* We pad to match fetch size */
> count = ring->funcs->align_mask + 1 -
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> index 5fd67e1cc2a0..97f33540aa02 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
> @@ -244,11 +244,11 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
> * otherwise the mailbox msg will be ruined/reseted by
> * the VF FLR.
> *
> - * we can unlock the lock_reset to allow "amdgpu_job_timedout"
> + * we can unlock the reset_sem to allow "amdgpu_job_timedout"
> * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
> * which means host side had finished this VF's FLR.
> */
> - locked = mutex_trylock(&adev->lock_reset);
> + locked = down_write_trylock(&adev->reset_sem);
> if (locked)
> adev->in_gpu_reset = true;
>
> @@ -263,7 +263,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
> flr_done:
> if (locked) {
> adev->in_gpu_reset = false;
> - mutex_unlock(&adev->lock_reset);
> + up_write(&adev->reset_sem);
> }
>
> /* Trigger recovery for world switch failure if no TDR */
> diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> index ce2bf1fb79ed..484d61c22396 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
> @@ -265,11 +265,11 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
> * otherwise the mailbox msg will be ruined/reseted by
> * the VF FLR.
> *
> - * we can unlock the lock_reset to allow "amdgpu_job_timedout"
> + * we can unlock the reset_sem to allow "amdgpu_job_timedout"
> * to run gpu_recover() after FLR_NOTIFICATION_CMPL received
> * which means host side had finished this VF's FLR.
> */
> - locked = mutex_trylock(&adev->lock_reset);
> + locked = down_write_trylock(&adev->reset_sem);
> if (locked)
> adev->in_gpu_reset = true;
>
> @@ -284,7 +284,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
> flr_done:
> if (locked) {
> adev->in_gpu_reset = false;
> - mutex_unlock(&adev->lock_reset);
> + up_write(&adev->reset_sem);
> }
>
> /* Trigger recovery for world switch failure if no TDR */
More information about the amd-gfx
mailing list