[PATCH] drm/amdgpu: capture invalid hardware access v2

Tue Mar 9 12:56:59 UTC 2021

Hi Dennis,

why do you think that this will always assert in reset thread?

In the reset thread while we are holding the reset lock write side 
lockdep_assert_held() should be satisfied and not cause any splat in the 
system log.

Regards,
Christian.

Am 09.03.21 um 03:03 schrieb Li, Dennis:
> [AMD Official Use Only - Internal Distribution Only]
>
> Hi, Christian,
>         amdgpu_device_skip_hw_access will always assert in reset thread, which seems not a good idea.
>
> Best Regards
> Dennis Li
> -----Original Message-----
> From: Christian König <ckoenig.leichtzumerken at gmail.com>
> Sent: Tuesday, March 9, 2021 2:07 AM
> To: amd-gfx at lists.freedesktop.org
> Cc: Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com>; Li, Dennis <Dennis.Li at amd.com>
> Subject: [PATCH] drm/amdgpu: capture invalid hardware access v2
>
> From: Dennis Li <Dennis.Li at amd.com>
>
> When recovery thread has begun GPU reset, there should be not other threads to access hardware, otherwise system randomly hang.
>
> v2 (chk): rewritten from scratch, use trylock and lockdep instead of
>    hand wiring the logic.
>
> Signed-off-by: Dennis Li <Dennis.Li at amd.com>
> Signed-off-by: Christian König <christian.koenig at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 74 +++++++++++++++++-----
>   1 file changed, 57 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index e247c3a2ec08..c990af6a43ca 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -326,6 +326,34 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
>   /*
>    * register access helper functions.
>    */
> +
> +/* Check if hw access should be skipped because of hotplug or device
> +error */ static bool amdgpu_device_skip_hw_access(struct amdgpu_device
> +*adev) {
> +if (adev->in_pci_err_recovery)
> +return true;
> +
> +#ifdef CONFIG_LOCKDEP
> +/*
> + * This is a bit complicated to understand, so worth a comment. What we assert
> + * here is that the GPU reset is not running on another thread in parallel.
> + *
> + * For this we trylock the read side of the reset semaphore, if that succeeds
> + * we know that the reset is not running in paralell.
> + *
> + * If the trylock fails we assert that we are either already holding the read
> + * side of the lock or are the reset thread itself and hold the write side of
> + * the lock.
> + */
> +if (down_read_trylock(&adev->reset_sem))
> +up_read(&adev->reset_sem);
> +else
> +lockdep_assert_held(&adev->reset_sem);
> +#endif
> +
> +return false;
> +}
> +
>   /**
>    * amdgpu_device_rreg - read a memory mapped IO or indirect register
>    *
> @@ -340,7 +368,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,  {
>   uint32_t ret;
>
> -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return 0;
>
>   if ((reg * 4) < adev->rmmio_size) {
> @@ -377,7 +405,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
>    */
>   uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)  {
> -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return 0;
>
>   if (offset < adev->rmmio_size)
> @@ -402,7 +430,7 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
>    */
>   void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)  {
> -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return;
>
>   if (offset < adev->rmmio_size)
> @@ -425,7 +453,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
>   uint32_t reg, uint32_t v,
>   uint32_t acc_flags)
>   {
> -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return;
>
>   if ((reg * 4) < adev->rmmio_size) {
> @@ -452,7 +480,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,  void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
>        uint32_t reg, uint32_t v)
>   {
> -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return;
>
>   if (amdgpu_sriov_fullaccess(adev) &&
> @@ -475,7 +503,7 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
>    */
>   u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)  {
> -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return 0;
>
>   if ((reg * 4) < adev->rio_mem_size)
> @@ -497,7 +525,7 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
>    */
>   void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)  {
> -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return;
>
>   if ((reg * 4) < adev->rio_mem_size)
> @@ -519,7 +547,7 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
>    */
>   u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)  {
> -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return 0;
>
>   if (index < adev->doorbell.num_doorbells) { @@ -542,7 +570,7 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
>    */
>   void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)  {
> -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return;
>
>   if (index < adev->doorbell.num_doorbells) { @@ -563,7 +591,7 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
>    */
>   u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)  {
> -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return 0;
>
>   if (index < adev->doorbell.num_doorbells) { @@ -586,7 +614,7 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
>    */
>   void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)  {
> -if (adev->in_pci_err_recovery)
> +if (amdgpu_device_skip_hw_access(adev))
>   return;
>
>   if (index < adev->doorbell.num_doorbells) { @@ -610,10 +638,13 @@ u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
>   u32 pcie_index, u32 pcie_data,
>   u32 reg_addr)
>   {
> -unsigned long flags;
> -u32 r;
>   void __iomem *pcie_index_offset;
>   void __iomem *pcie_data_offset;
> +unsigned long flags;
> +u32 r;
> +
> +if (amdgpu_device_skip_hw_access(adev))
> +return 0;
>
>   spin_lock_irqsave(&adev->pcie_idx_lock, flags);
>   pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; @@ -641,10 +672,13 @@ u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
>     u32 pcie_index, u32 pcie_data,
>     u32 reg_addr)
>   {
> -unsigned long flags;
> -u64 r;
>   void __iomem *pcie_index_offset;
>   void __iomem *pcie_data_offset;
> +unsigned long flags;
> +u64 r;
> +
> +if (amdgpu_device_skip_hw_access(adev))
> +return 0;
>
>   spin_lock_irqsave(&adev->pcie_idx_lock, flags);
>   pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; @@ -677,9 +711,12 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
>    u32 pcie_index, u32 pcie_data,
>    u32 reg_addr, u32 reg_data)
>   {
> -unsigned long flags;
>   void __iomem *pcie_index_offset;
>   void __iomem *pcie_data_offset;
> +unsigned long flags;
> +
> +if (amdgpu_device_skip_hw_access(adev))
> +return;
>
>   spin_lock_irqsave(&adev->pcie_idx_lock, flags);
>   pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4; @@ -706,9 +743,12 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
>      u32 pcie_index, u32 pcie_data,
>      u32 reg_addr, u64 reg_data)
>   {
> -unsigned long flags;
>   void __iomem *pcie_index_offset;
>   void __iomem *pcie_data_offset;
> +unsigned long flags;
> +
> +if (amdgpu_device_skip_hw_access(adev))
> +return;
>
>   spin_lock_irqsave(&adev->pcie_idx_lock, flags);
>   pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
> --
> 2.25.1
>