[PATCH v5 4/5] drm/amdgpu: address remove from fault filter

Wed Apr 28 06:54:48 UTC 2021

Am 27.04.21 um 20:21 schrieb Felix Kuehling:
> On 2021-04-27 10:51 a.m., Philip Yang wrote:
>> Add interface to remove address from fault filter ring by resetting
>> fault ring entry key, then future vm fault on the address will be
>> processed to recover.
>>
>> Define fault key as atomic64_t type to use atomic read/set/cmpxchg key
>> to protect fault ring access by interrupt handler and interrupt deferred
>> work for vg20. Change fault->timestamp to 48-bit to share same uint64_t
>> with 8-bit fault->next, it is enough for 48bit IH timestamp.
>>
>> Signed-off-by: Philip Yang <Philip.Yang at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 48 ++++++++++++++++++++++---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  6 ++--
>>   2 files changed, 48 insertions(+), 6 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> index c39ed9eb0987..a2e81e913abb 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> @@ -332,6 +332,17 @@ void amdgpu_gmc_agp_location(struct 
>> amdgpu_device *adev, struct amdgpu_gmc *mc)
>>               mc->agp_size >> 20, mc->agp_start, mc->agp_end);
>>   }
>>   +/**
>> + * amdgpu_gmc_fault_key - get hask key from vm fault address and pasid
>> + *
>> + * @addr: 48bit physical address
>> + * @pasid: 4 bit
>
> This comment is misleading. The PASID is not 4-bit. It's 16-bit. But 
> shifting the address by 4 bit is sufficient because the address is 
> page-aligned, which means the low 12 bits are 0. So this would be more 
> accurate:
>
> @addr: 48 bit physical address, page aligned (36 significant bits)
> @pasid: 16 bit process address space identifier
>
> With that fixed, the patch is
>
> Reviewed-by: Felix Kuehling <Felix.Kuehling at amd.com>
>
>
>> + */
>> +static inline uint64_t amdgpu_gmc_fault_key(uint64_t addr, uint16_t 
>> pasid)
>> +{
>> +    return addr << 4 | pasid;
>> +}

Maybe changing this so that we have "addr * ((1 << 16) / 
AMDGPU_PAGE_SIZE) | pasid" would help to better document that?

Christian.

>> +
>>   /**
>>    * amdgpu_gmc_filter_faults - filter VM faults
>>    *
>> @@ -348,8 +359,7 @@ bool amdgpu_gmc_filter_faults(struct 
>> amdgpu_device *adev, uint64_t addr,
>>                     uint16_t pasid, uint64_t timestamp)
>>   {
>>       struct amdgpu_gmc *gmc = &adev->gmc;
>> -
>> -    uint64_t stamp, key = addr << 4 | pasid;
>> +    uint64_t stamp, key = amdgpu_gmc_fault_key(addr, pasid);
>>       struct amdgpu_gmc_fault *fault;
>>       uint32_t hash;
>>   @@ -365,7 +375,7 @@ bool amdgpu_gmc_filter_faults(struct 
>> amdgpu_device *adev, uint64_t addr,
>>       while (fault->timestamp >= stamp) {
>>           uint64_t tmp;
>>   -        if (fault->key == key)
>> +        if (atomic64_read(&fault->key) == key)
>>               return true;
>>             tmp = fault->timestamp;
>> @@ -378,7 +388,7 @@ bool amdgpu_gmc_filter_faults(struct 
>> amdgpu_device *adev, uint64_t addr,
>>         /* Add the fault to the ring */
>>       fault = &gmc->fault_ring[gmc->last_fault];
>> -    fault->key = key;
>> +    atomic64_set(&fault->key, key);
>>       fault->timestamp = timestamp;
>>         /* And update the hash */
>> @@ -387,6 +397,36 @@ bool amdgpu_gmc_filter_faults(struct 
>> amdgpu_device *adev, uint64_t addr,
>>       return false;
>>   }
>>   +/**
>> + * amdgpu_gmc_filter_faults_remove - remove address from VM faults 
>> filter
>> + *
>> + * @adev: amdgpu device structure
>> + * @addr: address of the VM fault
>> + * @pasid: PASID of the process causing the fault
>> + *
>> + * Remove the address from fault filter, then future vm fault on 
>> this address
>> + * will pass to retry fault handler to recover.
>> + */
>> +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, 
>> uint64_t addr,
>> +                     uint16_t pasid)
>> +{
>> +    struct amdgpu_gmc *gmc = &adev->gmc;
>> +    uint64_t key = amdgpu_gmc_fault_key(addr, pasid);
>> +    struct amdgpu_gmc_fault *fault;
>> +    uint32_t hash;
>> +    uint64_t tmp;
>> +
>> +    hash = hash_64(key, AMDGPU_GMC_FAULT_HASH_ORDER);
>> +    fault = &gmc->fault_ring[gmc->fault_hash[hash].idx];
>> +    do {
>> +        if (atomic64_cmpxchg(&fault->key, key, 0) == key)
>> +            break;
>> +
>> +        tmp = fault->timestamp;
>> +        fault = &gmc->fault_ring[fault->next];
>> +    } while (fault->timestamp < tmp);
>> +}
>> +
>>   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
>>   {
>>       int r;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> index 9d11c02a3938..6aa1d52d3aee 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> @@ -66,9 +66,9 @@ struct firmware;
>>    * GMC page fault information
>>    */
>>   struct amdgpu_gmc_fault {
>> -    uint64_t    timestamp;
>> +    uint64_t    timestamp:48;
>>       uint64_t    next:AMDGPU_GMC_FAULT_RING_ORDER;
>> -    uint64_t    key:52;
>> +    atomic64_t    key;
>>   };
>>     /*
>> @@ -318,6 +318,8 @@ void amdgpu_gmc_agp_location(struct amdgpu_device 
>> *adev,
>>                    struct amdgpu_gmc *mc);
>>   bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev, uint64_t 
>> addr,
>>                     uint16_t pasid, uint64_t timestamp);
>> +void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, 
>> uint64_t addr,
>> +                     uint16_t pasid);
>>   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
>>   void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
>>   int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev);
> _______________________________________________
> amd-gfx mailing list
> amd-gfx at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx