[PATCH v3 4/5] drm/amdgpu: address remove from fault filter

Sat Apr 24 04:50:20 UTC 2021

Am 2021-04-23 um 12:28 p.m. schrieb Christian König:
>
>
> Am 23.04.21 um 17:35 schrieb Philip Yang:
>> Add interface to remove address from fault filter ring by resetting
>> fault ring entry key, then future vm fault on the address will be
>> processed to recover.
>>
>> Use spinlock to protect fault hash ring access by interrupt handler and
>> interrupt scheduled deferred work for vg20.
>>
>> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 66 +++++++++++++++++++++++--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  3 ++
>>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  1 +
>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  1 +
>>   4 files changed, 68 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> index c39ed9eb0987..91106b59389f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> @@ -332,6 +332,17 @@ void amdgpu_gmc_agp_location(struct
>> amdgpu_device *adev, struct amdgpu_gmc *mc)
>>               mc->agp_size >> 20, mc->agp_start, mc->agp_end);
>>   }
>>   +/**
>> + * fault_key - get 52bit hask key from vm fault address and pasid
>> + *
>> + * @addr: 48bit physical address
>> + * @pasid: 4 bit
>> + */
>> +static inline uint64_t fault_key(uint64_t addr, uint16_t pasid)
>
> Please prefix with amdgpu_gmc_
>
>> +{
>> +    return addr << 4 | pasid;
>> +}
>> +
>>   /**
>>    * amdgpu_gmc_filter_faults - filter VM faults
>>    *
>> @@ -349,15 +360,20 @@ bool amdgpu_gmc_filter_faults(struct
>> amdgpu_device *adev, uint64_t addr,
>>   {
>>       struct amdgpu_gmc *gmc = &adev->gmc;
>>   -    uint64_t stamp, key = addr << 4 | pasid;
>> +    uint64_t stamp, key = fault_key(addr, pasid);
>>       struct amdgpu_gmc_fault *fault;
>> +    unsigned long flags;
>>       uint32_t hash;
>>         /* If we don't have space left in the ring buffer return
>> immediately */
>>       stamp = max(timestamp, AMDGPU_GMC_FAULT_TIMEOUT + 1) -
>>           AMDGPU_GMC_FAULT_TIMEOUT;
>> -    if (gmc->fault_ring[gmc->last_fault].timestamp >= stamp)
>> +
>> +    spin_lock_irqsave(&gmc->fault_lock, flags);
>
> Uff the spinlock adds quite some overhead here. I'm still wondering if
> we can't somehow avoid this.

Does it? I guess the irqsave version has overhead because it needs to
disable and reenable interrupts. Other than that, spin locks should be
fairly low overhead, especially in the common (uncontested) case.

If we want to use atomic ops to update the key, it has to be a uint64_t,
not a bitfield. Maybe we could steal 8 bits for the "next" field from
the timestamp instead.

Regards,
  Felix

>
> Christian.
>
>> +    if (gmc->fault_ring[gmc->last_fault].timestamp >= stamp) {
>> +        spin_unlock_irqrestore(&gmc->fault_lock, flags);
>>           return true;
>> +    }
>>         /* Try to find the fault in the hash */
>>       hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
>> @@ -365,8 +381,10 @@ bool amdgpu_gmc_filter_faults(struct
>> amdgpu_device *adev, uint64_t addr,
>>       while (fault->timestamp >= stamp) {
>>           uint64_t tmp;
>>   -        if (fault->key == key)
>> +        if (fault->key == key) {
>> +            spin_unlock_irqrestore(&gmc->fault_lock, flags);
>>               return true;
>> +        }
>>             tmp = fault->timestamp;
>>           fault = &gmc->fault_ring[fault->next];
>> @@ -384,9 +402,51 @@ bool amdgpu_gmc_filter_faults(struct
>> amdgpu_device *adev, uint64_t addr,
>>       /* And update the hash */
>>       fault->next = gmc->fault_hash[hash].idx;
>>       gmc->fault_hash[hash].idx = gmc->last_fault++;
>> +    spin_unlock_irqrestore(&gmc->fault_lock, flags);
>>       return false;
>>   }
>>   +/**
>> + * amdgpu_gmc_filter_faults_remove - remove address from VM faults
>> filter
>> + *
>> + * @adev: amdgpu device structure
>> + * @addr: address of the VM fault
>> + * @pasid: PASID of the process causing the fault
>> + *
>> + * Remove the address from fault filter, then future vm fault on
>> this address
>> + * will pass to retry fault handler to recover.
>> + */
>> +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev,
>> uint64_t addr,
>> +                     uint16_t pasid)
>> +{
>> +    struct amdgpu_gmc *gmc = &adev->gmc;
>> +
>> +    uint64_t key = fault_key(addr, pasid);
>> +    struct amdgpu_gmc_fault *fault;
>> +    unsigned long flags;
>> +    uint32_t hash;
>> +
>> +    spin_lock_irqsave(&gmc->fault_lock, flags);
>> +    hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
>> +    fault = &gmc->fault_ring[gmc->fault_hash[hash].idx];
>> +    while (true) {
>> +        uint64_t tmp;
>> +
>> +        if (fault->key == key) {
>> +            fault->key = fault_key(0, 0);
>> +            break;
>> +        }
>> +
>> +        tmp = fault->timestamp;
>> +        fault = &gmc->fault_ring[fault->next];
>> +
>> +        /* Check if the entry was reused */
>> +        if (fault->timestamp >= tmp)
>> +            break;
>> +    }
>> +    spin_unlock_irqrestore(&gmc->fault_lock, flags);
>> +}
>> +
>>   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
>>   {
>>       int r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> index 9d11c02a3938..0aae3bd01bf2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> @@ -246,6 +246,7 @@ struct amdgpu_gmc {
>>           uint64_t    idx:AMDGPU_GMC_FAULT_RING_ORDER;
>>       } fault_hash[AMDGPU_GMC_FAULT_HASH_SIZE];
>>       uint64_t        last_fault:AMDGPU_GMC_FAULT_RING_ORDER;
>> +    spinlock_t        fault_lock;
>>         bool tmz_enabled;
>>   @@ -318,6 +319,8 @@ void amdgpu_gmc_agp_location(struct
>> amdgpu_device *adev,
>>                    struct amdgpu_gmc *mc);
>>   bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t
>> addr,
>>                     uint16_t pasid, uint64_t timestamp);
>> +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev,
>> uint64_t addr,
>> +                     uint16_t pasid);
>>   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
>>   void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
>>   int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index 498b28a35f5b..7416ad874652 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -839,6 +839,7 @@ static int gmc_v10_0_sw_init(void *handle)
>>       adev->mmhub.funcs->init(adev);
>>         spin_lock_init(&adev->gmc.invalidate_lock);
>> +    spin_lock_init(&adev->gmc.fault_lock);
>>         if ((adev->flags & AMD_IS_APU) && amdgpu_emu_mode == 1) {
>>           adev->gmc.vram_type = AMDGPU_VRAM_TYPE_DDR4;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index 4da8b3d28af2..3290b259a372 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -1444,6 +1444,7 @@ static int gmc_v9_0_sw_init(void *handle)
>>       adev->mmhub.funcs->init(adev);
>>         spin_lock_init(&adev->gmc.invalidate_lock);
>> +    spin_lock_init(&adev->gmc.fault_lock);
>>         r = amdgpu_atomfirmware_get_vram_info(adev,
>>           &vram_width, &vram_type, &vram_vendor);
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx