[PATCH] drm/amdgpu: capture invalid hardware access v2
Andrey Grodzovsky
andrey.grodzovsky at amd.com
Tue Mar 9 16:40:45 UTC 2021
Because he takes the write side lock only after amdgpu_pre_asic_reset,
which is where the HW suspend sequence happens (touching registers), I
think it will assert.
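
Roughly the ordering I mean - just a sketch of how I read the proposed
flow, not the actual code:

    /* recovery thread, as I understand the proposed flow */
    amdgpu_pre_asic_reset(...);     /* HW suspend sequence - goes through
                                     * the register helpers, i.e. the new
                                     * skip_hw_access check, before any
                                     * side of reset_sem is held */
    down_write(&adev->reset_sem);   /* write side only taken here */
    /* ... actual ASIC reset ... */
    up_write(&adev->reset_sem);
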
Andrey
On 2021-03-09 7:56 a.m., Christian König wrote:
> Hi Dennis,
>
> Why do you think that this will always assert in the reset thread?
>
> In the reset thread, while we are holding the write side of the reset
> lock, lockdep_assert_held() should be satisfied and not cause any splat
> in the system log.
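>
> A minimal sketch of the case I mean (assuming the reset thread looks
> roughly like this, with the register access shortened to RREG32()):
>
>     down_write(&adev->reset_sem);
>     /* any register helper called from here runs the new check */
>     RREG32(reg);   /* down_read_trylock() fails since we hold the write
>                     * side, but lockdep_assert_held() sees the lock as
>                     * held -> no splat */
>     up_write(&adev->reset_sem);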
>
> Regards,
> Christian.
>
> Am 09.03.21 um 03:03 schrieb Li, Dennis:
>> [AMD Official Use Only - Internal Distribution Only]
>>
>> Hi, Christian,
>> amdgpu_device_skip_hw_access() will always assert in the reset
>> thread, which does not seem like a good idea.
>>
>> Best Regards
>> Dennis Li
>> -----Original Message-----
>> From: Christian König <ckoenig.leichtzumerken at gmail.com>
>> Sent: Tuesday, March 9, 2021 2:07 AM
>> To: amd-gfx at lists.freedesktop.org
>> Cc: Grodzovsky, Andrey <Andrey.Grodzovsky at amd.com>; Li, Dennis
>> <Dennis.Li at amd.com>
>> Subject: [PATCH] drm/amdgpu: capture invalid hardware access v2
>>
>> From: Dennis Li <Dennis.Li at amd.com>
>>
>> When the recovery thread has begun a GPU reset, no other thread should
>> access the hardware; otherwise the system randomly hangs.
>>
>> v2 (chk): rewritten from scratch, use trylock and lockdep instead of
>> hand-wiring the logic.
>>
>> Signed-off-by: Dennis Li <Dennis.Li at amd.com>
>> Signed-off-by: Christian König <christian.koenig at amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 74 +++++++++++++++++-----
>> 1 file changed, 57 insertions(+), 17 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index e247c3a2ec08..c990af6a43ca 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -326,6 +326,34 @@ void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
>>  /*
>>   * register access helper functions.
>>   */
>> +
>> +/* Check if hw access should be skipped because of hotplug or device error */
>> +static bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
>> +{
>> +        if (adev->in_pci_err_recovery)
>> +                return true;
>> +
>> +#ifdef CONFIG_LOCKDEP
>> +        /*
>> +         * This is a bit complicated to understand, so worth a comment. What we assert
>> +         * here is that the GPU reset is not running on another thread in parallel.
>> +         *
>> +         * For this we trylock the read side of the reset semaphore, if that succeeds
>> +         * we know that the reset is not running in parallel.
>> +         *
>> +         * If the trylock fails we assert that we are either already holding the read
>> +         * side of the lock or are the reset thread itself and hold the write side of
>> +         * the lock.
>> +         */
>> +        if (down_read_trylock(&adev->reset_sem))
>> +                up_read(&adev->reset_sem);
>> +        else
>> +                lockdep_assert_held(&adev->reset_sem);
>> +#endif
>> +
>> +        return false;
>> +}
>> +
>>  /**
>>   * amdgpu_device_rreg - read a memory mapped IO or indirect register
>>   *
>> @@ -340,7 +368,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
>>  {
>>          uint32_t ret;
>>
>> -        if (adev->in_pci_err_recovery)
>> +        if (amdgpu_device_skip_hw_access(adev))
>>                  return 0;
>>
>>          if ((reg * 4) < adev->rmmio_size) {
>> @@ -377,7 +405,7 @@ uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
>>   */
>>  uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
>>  {
>> -        if (adev->in_pci_err_recovery)
>> +        if (amdgpu_device_skip_hw_access(adev))
>>                  return 0;
>>
>>          if (offset < adev->rmmio_size)
>> @@ -402,7 +430,7 @@ uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
>>   */
>>  void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
>>  {
>> -        if (adev->in_pci_err_recovery)
>> +        if (amdgpu_device_skip_hw_access(adev))
>>                  return;
>>
>>          if (offset < adev->rmmio_size)
>> @@ -425,7 +453,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
>>                          uint32_t reg, uint32_t v,
>>                          uint32_t acc_flags)
>>  {
>> -        if (adev->in_pci_err_recovery)
>> +        if (amdgpu_device_skip_hw_access(adev))
>>                  return;
>>
>>          if ((reg * 4) < adev->rmmio_size) {
>> @@ -452,7 +480,7 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
>>  void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
>>                               uint32_t reg, uint32_t v)
>>  {
>> -        if (adev->in_pci_err_recovery)
>> +        if (amdgpu_device_skip_hw_access(adev))
>>                  return;
>>
>>          if (amdgpu_sriov_fullaccess(adev) &&
>> @@ -475,7 +503,7 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
>>   */
>>  u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
>>  {
>> -        if (adev->in_pci_err_recovery)
>> +        if (amdgpu_device_skip_hw_access(adev))
>>                  return 0;
>>
>>          if ((reg * 4) < adev->rio_mem_size)
>> @@ -497,7 +525,7 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
>>   */
>>  void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
>>  {
>> -        if (adev->in_pci_err_recovery)
>> +        if (amdgpu_device_skip_hw_access(adev))
>>                  return;
>>
>>          if ((reg * 4) < adev->rio_mem_size)
>> @@ -519,7 +547,7 @@ void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
>>   */
>>  u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
>>  {
>> -        if (adev->in_pci_err_recovery)
>> +        if (amdgpu_device_skip_hw_access(adev))
>>                  return 0;
>>
>>          if (index < adev->doorbell.num_doorbells) {
>> @@ -542,7 +570,7 @@ u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
>>   */
>>  void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
>>  {
>> -        if (adev->in_pci_err_recovery)
>> +        if (amdgpu_device_skip_hw_access(adev))
>>                  return;
>>
>>          if (index < adev->doorbell.num_doorbells) {
>> @@ -563,7 +591,7 @@ void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
>>   */
>>  u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
>>  {
>> -        if (adev->in_pci_err_recovery)
>> +        if (amdgpu_device_skip_hw_access(adev))
>>                  return 0;
>>
>>          if (index < adev->doorbell.num_doorbells) {
>> @@ -586,7 +614,7 @@ u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
>>   */
>>  void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
>>  {
>> -        if (adev->in_pci_err_recovery)
>> +        if (amdgpu_device_skip_hw_access(adev))
>>                  return;
>>
>>          if (index < adev->doorbell.num_doorbells) {
>> @@ -610,10 +638,13 @@ u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
>>                                  u32 pcie_index, u32 pcie_data,
>>                                  u32 reg_addr)
>>  {
>> -        unsigned long flags;
>> -        u32 r;
>>          void __iomem *pcie_index_offset;
>>          void __iomem *pcie_data_offset;
>> +        unsigned long flags;
>> +        u32 r;
>> +
>> +        if (amdgpu_device_skip_hw_access(adev))
>> +                return 0;
>>
>>          spin_lock_irqsave(&adev->pcie_idx_lock, flags);
>>          pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
>> @@ -641,10 +672,13 @@ u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
>>                                    u32 pcie_index, u32 pcie_data,
>>                                    u32 reg_addr)
>>  {
>> -        unsigned long flags;
>> -        u64 r;
>>          void __iomem *pcie_index_offset;
>>          void __iomem *pcie_data_offset;
>> +        unsigned long flags;
>> +        u64 r;
>> +
>> +        if (amdgpu_device_skip_hw_access(adev))
>> +                return 0;
>>
>>          spin_lock_irqsave(&adev->pcie_idx_lock, flags);
>>          pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
>> @@ -677,9 +711,12 @@ void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
>>                                   u32 pcie_index, u32 pcie_data,
>>                                   u32 reg_addr, u32 reg_data)
>>  {
>> -        unsigned long flags;
>>          void __iomem *pcie_index_offset;
>>          void __iomem *pcie_data_offset;
>> +        unsigned long flags;
>> +
>> +        if (amdgpu_device_skip_hw_access(adev))
>> +                return;
>>
>>          spin_lock_irqsave(&adev->pcie_idx_lock, flags);
>>          pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
>> @@ -706,9 +743,12 @@ void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
>>                                     u32 pcie_index, u32 pcie_data,
>>                                     u32 reg_addr, u64 reg_data)
>>  {
>> -        unsigned long flags;
>>          void __iomem *pcie_index_offset;
>>          void __iomem *pcie_data_offset;
>> +        unsigned long flags;
>> +
>> +        if (amdgpu_device_skip_hw_access(adev))
>> +                return;
>>
>>          spin_lock_irqsave(&adev->pcie_idx_lock, flags);
>>          pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
>> --
>> 2.25.1
>>
>