[PATCH] drm/amdgpu: capture invalid hardware access v2

Wed Mar 10 03:12:03 UTC 2021

[AMD Official Use Only - Internal Distribution Only]

Hi, Christian,
      Because register r/w functions are also used in ISR, we need add in_irq() check. I updated this change in v3.

Best Regards
Dennis Li
-----Original Message-----
From: Christian König <ckoenig.leichtzumerken at gmail.com>
Sent: Tuesday, March 9, 2021 8:57 PM
To: Li, Dennis <Dennis.Li at amd.com>; amd-gfx at lists.freedesktop.org
Cc: Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com>
Subject: Re: [PATCH] drm/amdgpu: capture invalid hardware access v2

Hi Dennis,

why do you think that this will always assert in reset thread?

In the reset thread while we are holding the reset lock write side
lockdep_assert_held() should be satisfied and not cause any splat in the system log.

Regards,
Christian.

Am 09.03.21 um 03:03 schrieb Li, Dennis:
> [AMD Official Use Only - Internal Distribution Only]
>
> Hi, Christian,
>         amdgpu_device_skip_hw_access will always assert in reset thread, which seems not a good idea.
>
> Best Regards
> Dennis Li
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken at gmail.com>
> Sent: Tuesday, March 9, 2021 2:07 AM
> To: amd-gfx at lists.freedesktop.org
> Cc: Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com>; Li, Dennis
> <Dennis.Li at amd.com>
> Subject: [PATCH] drm/amdgpu: capture invalid hardware access v2
>
> From: Dennis Li <Dennis.Li at amd.com>
>
> When recovery thread has begun GPU reset, there should be not other threads to access hardware, otherwise system randomly hang.
>
> v2 (chk): rewritten from scratch, use trylock and lockdep instead of
>    hand wiring the logic.
>
> Signed-off-by: Dennis Li <Dennis.Li at amd.com>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 74 +++++++++++++++++-----
>   1 file changed, 57 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e247c3a2ec08..c990af6a43ca 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -326,6 +326,34 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
>   /*
>    * register access helper functions.
>    */
> +
> +/* Check if hw access should be skipped because of hotplug or device
> +error */ static bool amdgpu_device_skip_hw_access(struct
> +amdgpu_device
> +*adev) {
> +if (adev->in_pci_err_recovery)
> +return true;
> +
> +#ifdef CONFIG_LOCKDEP
> +/*
> + * This is a bit complicated to understand, so worth a comment. What
> +we assert
> + * here is that the GPU reset is not running on another thread in parallel.
> + *
> + * For this we trylock the read side of the reset semaphore, if that
> +succeeds
> + * we know that the reset is not running in paralell.
> + *
> + * If the trylock fails we assert that we are either already holding
> +the read
> + * side of the lock or are the reset thread itself and hold the write
> +side of
> + * the lock.
> + */
> +if (down_read_trylock(&adev->reset_sem))
> +up_read(&adev->reset_sem);
> +else
> +lockdep_assert_held(&adev->reset_sem);
> +#endif
> +
> +return false;
> +}
> +
>   /**
>    * amdgpu_device_rreg - read a memory mapped IO or indirect register
>    *
> @@ -340,7 +368,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,  {
>   uint32_t ret;
>
> -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return 0;
>
>   if ((reg * 4) < adev->rmmio_size) {
> @@ -377,7 +405,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
>    */
>   uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
> { -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return 0;
>
>   if (offset < adev->rmmio_size)
> @@ -402,7 +430,7 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
>    */
>   void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset,
> uint8_t value)  { -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return;
>
>   if (offset < adev->rmmio_size)
> @@ -425,7 +453,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
>   uint32_t reg, uint32_t v,
>   uint32_t acc_flags)
>   {
> -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return;
>
>   if ((reg * 4) < adev->rmmio_size) {
> @@ -452,7 +480,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,  void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
>        uint32_t reg, uint32_t v)
>   {
> -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return;
>
>   if (amdgpu_sriov_fullaccess(adev) && @@ -475,7 +503,7 @@ void
> amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
>    */
>   u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)  { -if
> (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return 0;
>
>   if ((reg * 4) < adev->rio_mem_size)
> @@ -497,7 +525,7 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
>    */
>   void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)  {
> -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return;
>
>   if ((reg * 4) < adev->rio_mem_size)
> @@ -519,7 +547,7 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
>    */
>   u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)  {
> -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return 0;
>
>   if (index < adev->doorbell.num_doorbells) { @@ -542,7 +570,7 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
>    */
>   void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32
> v)  { -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return;
>
>   if (index < adev->doorbell.num_doorbells) { @@ -563,7 +591,7 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
>    */
>   u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)  {
> -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return 0;
>
>   if (index < adev->doorbell.num_doorbells) { @@ -586,7 +614,7 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
>    */
>   void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index,
> u64 v)  { -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return;
>
>   if (index < adev->doorbell.num_doorbells) { @@ -610,10 +638,13 @@ u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
>   u32 pcie_index, u32 pcie_data,
>   u32 reg_addr)
>   {
> -unsigned long flags;
> -u32 r;
>   void __iomem *pcie_index_offset;
>   void __iomem *pcie_data_offset;
> +unsigned long flags;
> +u32 r;
> +
> +if (amdgpu_device_skip_hw_access(adev))
> +return 0;
>
>   spin_lock_irqsave(&adev->pcie_idx_lock, flags);
>   pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; @@ -641,10 +672,13 @@ u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
>     u32 pcie_index, u32 pcie_data,
>     u32 reg_addr)
>   {
> -unsigned long flags;
> -u64 r;
>   void __iomem *pcie_index_offset;
>   void __iomem *pcie_data_offset;
> +unsigned long flags;
> +u64 r;
> +
> +if (amdgpu_device_skip_hw_access(adev))
> +return 0;
>
>   spin_lock_irqsave(&adev->pcie_idx_lock, flags);
>   pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; @@ -677,9 +711,12 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
>    u32 pcie_index, u32 pcie_data,
>    u32 reg_addr, u32 reg_data)
>   {
> -unsigned long flags;
>   void __iomem *pcie_index_offset;
>   void __iomem *pcie_data_offset;
> +unsigned long flags;
> +
> +if (amdgpu_device_skip_hw_access(adev))
> +return;
>
>   spin_lock_irqsave(&adev->pcie_idx_lock, flags);
>   pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; @@ -706,9 +743,12 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
>      u32 pcie_index, u32 pcie_data,
>      u32 reg_addr, u64 reg_data)
>   {
> -unsigned long flags;
>   void __iomem *pcie_index_offset;
>   void __iomem *pcie_data_offset;
> +unsigned long flags;
> +
> +if (amdgpu_device_skip_hw_access(adev))
> +return;
>
>   spin_lock_irqsave(&adev->pcie_idx_lock, flags);
>   pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
> --
> 2.25.1
>