[PATCH] drm/amdgpu: block hardware accessed by other threads when doing gpu recovery
Christian König
christian.koenig at amd.com
Mon Mar 1 11:52:39 UTC 2021
Adding Andrey as well. Need to wrap my head around that problem once more.
In general, the in_irq()/in_interrupt() macros should not be used in
driver code anymore, and I'm pretty sure we need to block the access
at a higher level (roughly sketched below).
Regards,
Christian.
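
As a rough illustration of what blocking the access at a higher level
could look like (a hypothetical sketch, not Christian's actual
proposal; foo_configure_hw() and the mmFOO_* register names are made
up), a sleepable caller would take the reset lock once around an
entire hardware sequence instead of checking in_irq() per register:

/* Hypothetical sketch: take the reset lock once, at a higher level,
 * around a whole sequence of register accesses, instead of checking
 * in_irq() inside every low-level accessor.
 */
static int foo_configure_hw(struct amdgpu_device *adev)
{
        int r = 0;

        /* Process context only: down_read() may sleep here until a
         * concurrent GPU reset has released reset_sem.
         */
        down_read(&adev->reset_sem);

        WREG32(mmFOO_CTRL, 1);
        if (!(RREG32(mmFOO_STATUS) & 1))
                r = -EIO;

        up_read(&adev->reset_sem);
        return r;
}

With the lock taken at this level, interrupt context never touches
reset_sem at all, and the low-level register accessors stay lock-free.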
On 01.03.21 at 12:12, Dennis Li wrote:
> When the GPU recovery thread is doing a GPU reset, it is unsafe for
> other threads to access the hardware concurrently, which can cause
> the GPU reset to hang randomly (a condensed sketch of the resulting
> locking scheme follows the patch).
>
> Signed-off-by: Dennis Li <Dennis.Li at amd.com>
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 1624c2bc8285..c71d3bba5f69 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1033,6 +1033,7 @@ struct amdgpu_device {
> atomic_t in_gpu_reset;
> enum pp_mp1_state mp1_state;
> struct rw_semaphore reset_sem;
> + struct thread_info *recovery_thread;
> struct amdgpu_doorbell_index doorbell_index;
>
> struct mutex notifier_lock;
> @@ -1385,4 +1386,13 @@ static inline int amdgpu_in_reset(struct amdgpu_device *adev)
> {
> return atomic_read(&adev->in_gpu_reset);
> }
> +
> +static inline bool amdgpu_in_recovery_thread(struct amdgpu_device *adev)
> +{
> + if (unlikely(adev->recovery_thread != NULL) &&
> + adev->recovery_thread == current_thread_info())
> + return true;
> +
> + return false;
> +}
> #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 71805dfd9e25..7c17a5468d43 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -401,13 +401,22 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
> */
> void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
> {
> + bool locked;
> +
> if (adev->in_pci_err_recovery)
> return;
>
> + locked = likely(!amdgpu_in_recovery_thread(adev)) && !in_irq();
> + if (locked)
> + down_read(&adev->reset_sem);
> +
> if (offset < adev->rmmio_size)
> writeb(value, adev->rmmio + offset);
> else
> BUG();
> +
> + if (locked)
> + up_read(&adev->reset_sem);
> }
>
> /**
> @@ -424,15 +433,19 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
> uint32_t reg, uint32_t v,
> uint32_t acc_flags)
> {
> + bool locked;
> +
> if (adev->in_pci_err_recovery)
> return;
>
> + locked = likely(!amdgpu_in_recovery_thread(adev)) && !in_irq();
> + if (locked)
> + down_read(&adev->reset_sem);
> +
> if ((reg * 4) < adev->rmmio_size) {
> if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
> - amdgpu_sriov_runtime(adev) &&
> - down_read_trylock(&adev->reset_sem)) {
> + amdgpu_sriov_runtime(adev)) {
> amdgpu_kiq_wreg(adev, reg, v);
> - up_read(&adev->reset_sem);
> } else {
> writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
> }
> @@ -440,6 +453,9 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
> adev->pcie_wreg(adev, reg * 4, v);
> }
>
> + if (locked)
> + up_read(&adev->reset_sem);
> +
> trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
> }
>
> @@ -451,9 +467,15 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
> void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
> uint32_t reg, uint32_t v)
> {
> + bool locked;
> +
> if (adev->in_pci_err_recovery)
> return;
>
> + locked = likely(!amdgpu_in_recovery_thread(adev)) && !in_irq();
> + if (locked)
> + down_read(&adev->reset_sem);
> +
> if (amdgpu_sriov_fullaccess(adev) &&
> adev->gfx.rlc.funcs &&
> adev->gfx.rlc.funcs->is_rlcg_access_range) {
> @@ -462,6 +484,9 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
> } else {
> writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
> }
> +
> + if (locked)
> + up_read(&adev->reset_sem);
> }
>
> /**
> @@ -496,15 +521,24 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
> */
> void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
> {
> + bool locked;
> +
> if (adev->in_pci_err_recovery)
> return;
>
> + locked = likely(!amdgpu_in_recovery_thread(adev)) && !in_irq();
> + if (locked)
> + down_read(&adev->reset_sem);
> +
> if ((reg * 4) < adev->rio_mem_size)
> iowrite32(v, adev->rio_mem + (reg * 4));
> else {
> iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
> iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
> }
> +
> + if (locked)
> + up_read(&adev->reset_sem);
> }
>
> /**
> @@ -679,6 +713,11 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
> unsigned long flags;
> void __iomem *pcie_index_offset;
> void __iomem *pcie_data_offset;
> + bool locked;
> +
> + locked = likely(!amdgpu_in_recovery_thread(adev)) && !in_irq();
> + if (locked)
> + down_read(&adev->reset_sem);
>
> spin_lock_irqsave(&adev->pcie_idx_lock, flags);
> pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
> @@ -689,6 +728,9 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
> writel(reg_data, pcie_data_offset);
> readl(pcie_data_offset);
> spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
> +
> + if (locked)
> + up_read(&adev->reset_sem);
> }
>
> /**
> @@ -708,6 +750,11 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
> unsigned long flags;
> void __iomem *pcie_index_offset;
> void __iomem *pcie_data_offset;
> + bool locked;
> +
> + locked = likely(!amdgpu_in_recovery_thread(adev)) && !in_irq();
> + if (locked)
> + down_read(&adev->reset_sem);
>
> spin_lock_irqsave(&adev->pcie_idx_lock, flags);
> pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
> @@ -724,6 +771,9 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
> writel((u32)(reg_data >> 32), pcie_data_offset);
> readl(pcie_data_offset);
> spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
> +
> + if (locked)
> + up_read(&adev->reset_sem);
> }
>
> /**
> @@ -4459,6 +4509,8 @@ static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
> break;
> }
>
> + adev->recovery_thread = current_thread_info();
> +
> return true;
> }
>
> @@ -4467,6 +4519,7 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
> amdgpu_vf_error_trans_all(adev);
> adev->mp1_state = PP_MP1_STATE_NONE;
> atomic_set(&adev->in_gpu_reset, 0);
> + adev->recovery_thread = NULL;
> up_write(&adev->reset_sem);
> }
>
> @@ -4609,7 +4662,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
>
> dev_info(adev->dev, "GPU %s begin!\n",
> need_emergency_restart ? "jobs stop":"reset");
> -
> /*
> * Here we trylock to avoid chain of resets executing from
> * either trigger by jobs on different adevs in XGMI hive or jobs on
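
For readers skimming the diff, a condensed model of the scheme the
patch implements (illustrative only; the bodies are simplified from
the hunks above): the recovery thread takes reset_sem for writing and
records its thread_info in adev->recovery_thread, and every register
accessor takes reset_sem for reading unless it either is the recovery
thread itself or runs in interrupt context, where down_read() must not
be called because it may sleep:

/* Condensed model of the patch's locking scheme -- illustrative
 * only; the function bodies are simplified from the hunks above.
 */
static void gpu_recover(struct amdgpu_device *adev)
{
        down_write(&adev->reset_sem);   /* exclude all readers */
        adev->recovery_thread = current_thread_info();

        /* ... reset the GPU; register accesses from this thread
         * skip the read lock via the recovery_thread check ...
         */

        adev->recovery_thread = NULL;
        up_write(&adev->reset_sem);
}

static void reg_write(struct amdgpu_device *adev, u32 reg, u32 v)
{
        /* Lock only when this is neither the recovery thread nor
         * interrupt context, because down_read() may sleep.
         */
        bool locked = !amdgpu_in_recovery_thread(adev) && !in_irq();

        if (locked)
                down_read(&adev->reset_sem);

        writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));

        if (locked)
                up_read(&adev->reset_sem);
}

Note that the !in_irq() escape is exactly what Christian flags above:
an accessor running in interrupt context takes no lock at all and can
still race with the reset, which is one reason to block the access at
a higher level, where the calling context is known.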