[PATCH] drm/amdgpu: fix system hang issue during GPU reset
Dennis Li
Dennis.Li at amd.com
Mon Jul 6 10:01:13 UTC 2020
During GPU reset, driver should hold on all external access to
GPU, otherwise psp will randomly fail to do post, and then cause
system hang.
Signed-off-by: Dennis Li <dennis.li at amd.com>
Change-Id: I7d5d41f9c4198b917d7b49606ba3850988e5b936
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 6c7dd0a707c9..34bfc2a147ff 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -965,7 +965,7 @@ struct amdgpu_device {
bool in_gpu_reset;
enum pp_mp1_state mp1_state;
- struct mutex lock_reset;
+ struct rw_semaphore reset_sem;
struct amdgpu_doorbell_index doorbell_index;
struct mutex notifier_lock;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index ad59ac4423b8..4139c81389a4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -611,7 +611,9 @@ int amdgpu_amdkfd_submit_ib(struct kgd_dev *kgd, enum kgd_engine_type engine,
/* This works for NO_HWS. TODO: need to handle without knowing VMID */
job->vmid = vmid;
+ down_read(&adev->reset_sem);
ret = amdgpu_ib_schedule(ring, 1, ib, job, &f);
+ up_read(&adev->reset_sem);
if (ret) {
DRM_ERROR("amdgpu: failed to schedule IB.\n");
goto err_ib_sched;
@@ -649,6 +651,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
{
struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
+ down_read(&adev->reset_sem);
+
if (adev->family == AMDGPU_FAMILY_AI) {
int i;
@@ -658,6 +662,8 @@ int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct kgd_dev *kgd, uint16_t vmid)
amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB_0, 0);
}
+ up_read(&adev->reset_sem);
+
return 0;
}
@@ -666,11 +672,18 @@ int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct kgd_dev *kgd, uint16_t pasid)
struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
const uint32_t flush_type = 0;
bool all_hub = false;
+ int ret = 0;
if (adev->family == AMDGPU_FAMILY_AI)
all_hub = true;
- return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub);
+ down_read(&adev->reset_sem);
+
+ ret = amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, all_hub);
+
+ up_read(&adev->reset_sem);
+
+ return ret;
}
bool amdgpu_amdkfd_have_atomics_support(struct kgd_dev *kgd)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 691c89705bcd..db5d533dd406 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -542,6 +542,7 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
unsigned long end_jiffies;
uint32_t temp;
struct v10_compute_mqd *m = get_mqd(mqd);
+ int ret = 0;
if (adev->in_gpu_reset)
return -EIO;
@@ -551,6 +552,8 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
int retry;
#endif
+ down_read(&adev->reset_sem);
+
acquire_queue(kgd, pipe_id, queue_id);
if (m->cp_hqd_vmid == 0)
@@ -633,14 +636,16 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
break;
if (time_after(jiffies, end_jiffies)) {
pr_err("cp queue preemption time out.\n");
- release_queue(kgd);
- return -ETIME;
+ ret = -ETIME;
+ goto pro_end;
}
usleep_range(500, 1000);
}
+pro_end:
release_queue(kgd);
- return 0;
+ up_read(&adev->reset_sem);
+ return ret;
}
static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
index 0b7e78748540..cf27fe5091aa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v7.c
@@ -424,10 +424,13 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
enum hqd_dequeue_request_type type;
unsigned long flags, end_jiffies;
int retry;
+ int ret = 0;
if (adev->in_gpu_reset)
return -EIO;
+ down_read(&adev->reset_sem);
+
acquire_queue(kgd, pipe_id, queue_id);
WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, 0);
@@ -506,14 +509,16 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
break;
if (time_after(jiffies, end_jiffies)) {
pr_err("cp queue preemption time out\n");
- release_queue(kgd);
- return -ETIME;
+ ret = -ETIME;
+ goto pro_end;
}
usleep_range(500, 1000);
}
+pro_end:
release_queue(kgd);
- return 0;
+ up_read(&adev->reset_sem);
+ return ret;
}
static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
index ccd635b812b5..bc8e07186a85 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v8.c
@@ -420,10 +420,13 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
unsigned long flags, end_jiffies;
int retry;
struct vi_mqd *m = get_mqd(mqd);
+ int ret = 0;
if (adev->in_gpu_reset)
return -EIO;
+ down_read(&adev->reset_sem);
+
acquire_queue(kgd, pipe_id, queue_id);
if (m->cp_hqd_vmid == 0)
@@ -504,14 +507,16 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
break;
if (time_after(jiffies, end_jiffies)) {
pr_err("cp queue preemption time out.\n");
- release_queue(kgd);
- return -ETIME;
+ ret = -ETIME;
+ goto pro_end;
}
usleep_range(500, 1000);
}
+pro_end:
release_queue(kgd);
- return 0;
+ up_read(&adev->reset_sem);
+ return ret;
}
static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index df841c2ac5e7..341ad652d910 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -540,10 +540,13 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
unsigned long end_jiffies;
uint32_t temp;
struct v9_mqd *m = get_mqd(mqd);
+ int ret = 0;
if (adev->in_gpu_reset)
return -EIO;
+ down_read(&adev->reset_sem);
+
acquire_queue(kgd, pipe_id, queue_id);
if (m->cp_hqd_vmid == 0)
@@ -570,14 +573,16 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
break;
if (time_after(jiffies, end_jiffies)) {
pr_err("cp queue preemption time out.\n");
- release_queue(kgd);
- return -ETIME;
+ ret = -ETIME;
+ goto pro_end;
}
usleep_range(500, 1000);
}
+pro_end:
release_queue(kgd);
- return 0;
+ up_read(&adev->reset_sem);
+ return ret;
}
static int kgd_hqd_sdma_destroy(struct kgd_dev *kgd, void *mqd,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index aeada7c9fbea..d8f264c47b86 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -100,14 +100,14 @@ static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
file->private_data = adev;
- mutex_lock(&adev->lock_reset);
+ down_read(&adev->reset_sem);
if (adev->autodump.dumping.done) {
reinit_completion(&adev->autodump.dumping);
ret = 0;
} else {
ret = -EBUSY;
}
- mutex_unlock(&adev->lock_reset);
+ up_read(&adev->reset_sem);
return ret;
}
@@ -1188,7 +1188,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
}
/* Avoid accidently unparking the sched thread during GPU reset */
- mutex_lock(&adev->lock_reset);
+ down_read(&adev->reset_sem);
/* hold on the scheduler */
for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
@@ -1215,7 +1215,7 @@ static int amdgpu_debugfs_test_ib(struct seq_file *m, void *data)
kthread_unpark(ring->sched.thread);
}
- mutex_unlock(&adev->lock_reset);
+ up_read(&adev->reset_sem);
pm_runtime_mark_last_busy(dev->dev);
pm_runtime_put_autosuspend(dev->dev);
@@ -1395,7 +1395,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
return -ENOMEM;
/* Avoid accidently unparking the sched thread during GPU reset */
- mutex_lock(&adev->lock_reset);
+ down_read(&adev->reset_sem);
/* stop the scheduler */
kthread_park(ring->sched.thread);
@@ -1436,7 +1436,7 @@ static int amdgpu_debugfs_ib_preempt(void *data, u64 val)
/* restart the scheduler */
kthread_unpark(ring->sched.thread);
- mutex_unlock(&adev->lock_reset);
+ up_read(&adev->reset_sem);
ttm_bo_unlock_delayed_workqueue(&adev->mman.bdev, resched);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2858c09fd8c0..bc902c59c1c0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3044,7 +3044,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
mutex_init(&adev->mn_lock);
mutex_init(&adev->virt.vf_errors.lock);
hash_init(adev->mn_hash);
- mutex_init(&adev->lock_reset);
+ init_rwsem(&adev->reset_sem);
mutex_init(&adev->psp.mutex);
mutex_init(&adev->notifier_lock);
@@ -4150,10 +4150,10 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
{
if (trylock) {
- if (!mutex_trylock(&adev->lock_reset))
+ if (!down_write_trylock(&adev->reset_sem))
return false;
} else
- mutex_lock(&adev->lock_reset);
+ down_write(&adev->reset_sem);
atomic_inc(&adev->gpu_reset_counter);
adev->in_gpu_reset = true;
@@ -4177,7 +4177,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
amdgpu_vf_error_trans_all(adev);
adev->mp1_state = PP_MP1_STATE_NONE;
adev->in_gpu_reset = false;
- mutex_unlock(&adev->lock_reset);
+ up_write(&adev->reset_sem);
}
static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 2975c4a6e581..d4c69f9577a4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -225,8 +225,10 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
if (finished->error < 0) {
DRM_INFO("Skip scheduling IBs!\n");
} else {
+ down_read(&ring->adev->reset_sem);
r = amdgpu_ib_schedule(ring, job->num_ibs, job->ibs, job,
&fence);
+ up_read(&ring->adev->reset_sem);
if (r)
DRM_ERROR("Error scheduling IBs (%d)\n", r);
}
@@ -237,6 +239,7 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
amdgpu_job_free_resources(job);
fence = r ? ERR_PTR(r) : fence;
+
return fence;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 13ea8ebc421c..38a751f7d889 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -121,6 +121,7 @@ void amdgpu_ring_generic_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
void amdgpu_ring_commit(struct amdgpu_ring *ring)
{
uint32_t count;
+ struct amdgpu_device *adev = ring->adev;
/* We pad to match fetch size */
count = ring->funcs->align_mask + 1 -
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
index 5fd67e1cc2a0..97f33540aa02 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c
@@ -244,11 +244,11 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
* otherwise the mailbox msg will be ruined/reseted by
* the VF FLR.
*
- * we can unlock the lock_reset to allow "amdgpu_job_timedout"
+ * we can unlock the reset_sem to allow "amdgpu_job_timedout"
* to run gpu_recover() after FLR_NOTIFICATION_CMPL received
* which means host side had finished this VF's FLR.
*/
- locked = mutex_trylock(&adev->lock_reset);
+ locked = down_write_trylock(&adev->reset_sem);
if (locked)
adev->in_gpu_reset = true;
@@ -263,7 +263,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
flr_done:
if (locked) {
adev->in_gpu_reset = false;
- mutex_unlock(&adev->lock_reset);
+ up_write(&adev->reset_sem);
}
/* Trigger recovery for world switch failure if no TDR */
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
index ce2bf1fb79ed..484d61c22396 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c
@@ -265,11 +265,11 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
* otherwise the mailbox msg will be ruined/reseted by
* the VF FLR.
*
- * we can unlock the lock_reset to allow "amdgpu_job_timedout"
+ * we can unlock the reset_sem to allow "amdgpu_job_timedout"
* to run gpu_recover() after FLR_NOTIFICATION_CMPL received
* which means host side had finished this VF's FLR.
*/
- locked = mutex_trylock(&adev->lock_reset);
+ locked = down_write_trylock(&adev->reset_sem);
if (locked)
adev->in_gpu_reset = true;
@@ -284,7 +284,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
flr_done:
if (locked) {
adev->in_gpu_reset = false;
- mutex_unlock(&adev->lock_reset);
+ up_write(&adev->reset_sem);
}
/* Trigger recovery for world switch failure if no TDR */
--
2.17.1
More information about the amd-gfx
mailing list