[RFC PATCH 1/2] drm/amdkfd: Move TLB flushing logic into amdgpu

Mon Mar 13 13:21:07 UTC 2023

Am 2023-03-13 um 03:36 schrieb Christian König:
> Am 10.03.23 um 23:16 schrieb Felix Kuehling:
>> This will make it possible for amdgpu GEM ioctls to flush TLBs on 
>> compute
>> VMs.
>>
>> This removes VMID-based TLB flushing and always uses PASID-based
>> flushing. This still works because it scans the VMID-PASID mapping
>> registers to find the right VMID. It's only slightly less efficient. 
>> This
>> is not a production use case.
>
> On the one hand it looks nice that we can now flush based on the pasid 
> without having the NO_HWS dependency (I hope that this was intentional).

The intention was to be able to trigger PASID TLB flushes from AMDGPU. I 
need it for flushing compute VMs when their mappings are modified by the 
GEM VA ioctl. Removing the NO_HWS dependency was a useful simplification 
to get there.

>
> On the other hand I really don't like to have any variables in the 
> amdgpu_vm structure which are not worked with by the VM code itself.

So what's the solution? Move that code into amdgpu_vm.c? Or find a 
different data structure to store the sequence number? I guess I could 
put it in struct amdgpu_fpriv, which amdgpu_vm is part of.

>
> That already backfired with the pd_phys_addr before.

Can you clarify this concern? I'm not aware of any issues with pd_phys_addr.

Thanks,
   Felix

>
> Regards,
> Christian.
>
>>
>> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 30 +++++++++-------------
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  6 ++---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h     |  1 +
>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h      |  9 +++++--
>>   drivers/gpu/drm/amd/amdkfd/kfd_process.c   | 28 --------------------
>>   5 files changed, 22 insertions(+), 52 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> index 8816853e50c0..dcbd28e99b5c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
>> @@ -726,31 +726,25 @@ bool amdgpu_amdkfd_is_kfd_vmid(struct 
>> amdgpu_device *adev, u32 vmid)
>>       return false;
>>   }
>>   -int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct amdgpu_device *adev,
>> -                     uint16_t vmid)
>> -{
>> -    if (adev->family == AMDGPU_FAMILY_AI) {
>> -        int i;
>> -
>> -        for (i = 0; i < adev->num_vmhubs; i++)
>> -            amdgpu_gmc_flush_gpu_tlb(adev, vmid, i, 0);
>> -    } else {
>> -        amdgpu_gmc_flush_gpu_tlb(adev, vmid, AMDGPU_GFXHUB_0, 0);
>> -    }
>> -
>> -    return 0;
>> -}
>> -
>> -int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>> -                      uint16_t pasid, enum TLB_FLUSH_TYPE flush_type)
>> +int amdgpu_amdkfd_flush_tlb(struct amdgpu_device *adev, struct 
>> amdgpu_vm *vm,
>> +                enum TLB_FLUSH_TYPE type)
>>   {
>> +    uint64_t tlb_seq = amdgpu_vm_tlb_seq(vm);
>>       bool all_hub = false;
>>   +    /*
>> +     * It can be that we race and lose here, but that is extremely 
>> unlikely
>> +     * and the worst thing which could happen is that we flush the 
>> changes
>> +     * into the TLB once more which is harmless.
>> +     */
>> +    if (atomic64_xchg(&vm->kfd_last_flushed_seq, tlb_seq) == tlb_seq)
>> +        return 0;
>> +
>>       if (adev->family == AMDGPU_FAMILY_AI ||
>>           adev->family == AMDGPU_FAMILY_RV)
>>           all_hub = true;
>>   -    return amdgpu_gmc_flush_gpu_tlb_pasid(adev, pasid, flush_type, 
>> all_hub);
>> +    return amdgpu_gmc_flush_gpu_tlb_pasid(adev, vm->pasid, type, 
>> all_hub);
>>   }
>>     bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> index 01ba3589b60a..dcaf69fd833c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> @@ -157,10 +157,8 @@ int amdgpu_amdkfd_submit_ib(struct amdgpu_device 
>> *adev,
>>                   uint32_t *ib_cmd, uint32_t ib_len);
>>   void amdgpu_amdkfd_set_compute_idle(struct amdgpu_device *adev, 
>> bool idle);
>>   bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev);
>> -int amdgpu_amdkfd_flush_gpu_tlb_vmid(struct amdgpu_device *adev,
>> -                uint16_t vmid);
>> -int amdgpu_amdkfd_flush_gpu_tlb_pasid(struct amdgpu_device *adev,
>> -                uint16_t pasid, enum TLB_FLUSH_TYPE flush_type);
>> +int amdgpu_amdkfd_flush_tlb(struct amdgpu_device *adev, struct 
>> amdgpu_vm *vm,
>> +                enum TLB_FLUSH_TYPE type);
>>     bool amdgpu_amdkfd_is_kfd_vmid(struct amdgpu_device *adev, u32 
>> vmid);
>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> index 03a3314e5b43..171de7da2959 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> @@ -290,6 +290,7 @@ struct amdgpu_vm {
>>       /* Last finished delayed update */
>>       atomic64_t        tlb_seq;
>>       struct dma_fence    *last_tlb_flush;
>> +    atomic64_t        kfd_last_flushed_seq;
>>         /* Last unlocked submission to the scheduler entities */
>>       struct dma_fence    *last_unlocked;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index bfa30d12406b..e029129308e7 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -715,7 +715,6 @@ struct kfd_process_device {
>>       /* VM context for GPUVM allocations */
>>       struct file *drm_file;
>>       void *drm_priv;
>> -    atomic64_t tlb_seq;
>>         /* GPUVM allocations storage */
>>       struct idr alloc_idr;
>> @@ -1344,7 +1343,13 @@ void kfd_signal_reset_event(struct kfd_dev *dev);
>>     void kfd_signal_poison_consumed_event(struct kfd_dev *dev, u32 
>> pasid);
>>   -void kfd_flush_tlb(struct kfd_process_device *pdd, enum 
>> TLB_FLUSH_TYPE type);
>> +static inline void kfd_flush_tlb(struct kfd_process_device 
>> *pdd,                             enum TLB_FLUSH_TYPE type)
>> +{
>> +    struct amdgpu_device *adev = pdd->dev->adev;
>> +    struct amdgpu_vm *vm = drm_priv_to_vm(pdd->drm_priv);
>> +
>> +    amdgpu_amdkfd_flush_tlb(adev, vm, type);
>> +}
>>     static inline bool kfd_flush_tlb_after_unmap(struct kfd_dev *dev)
>>   {
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> index ebabe92f7edb..48d7d30eeb24 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> @@ -1591,7 +1591,6 @@ int kfd_process_device_init_vm(struct 
>> kfd_process_device *pdd,
>>           return ret;
>>       }
>>       pdd->drm_priv = drm_file->private_data;
>> -    atomic64_set(&pdd->tlb_seq, 0);
>>         ret = kfd_process_device_reserve_ib_mem(pdd);
>>       if (ret)
>> @@ -1994,33 +1993,6 @@ int kfd_reserved_mem_mmap(struct kfd_dev *dev, 
>> struct kfd_process *process,
>>                      KFD_CWSR_TBA_TMA_SIZE, vma->vm_page_prot);
>>   }
>>   -void kfd_flush_tlb(struct kfd_process_device *pdd, enum 
>> TLB_FLUSH_TYPE type)
>> -{
>> -    struct amdgpu_vm *vm = drm_priv_to_vm(pdd->drm_priv);
>> -    uint64_t tlb_seq = amdgpu_vm_tlb_seq(vm);
>> -    struct kfd_dev *dev = pdd->dev;
>> -
>> -    /*
>> -     * It can be that we race and lose here, but that is extremely 
>> unlikely
>> -     * and the worst thing which could happen is that we flush the 
>> changes
>> -     * into the TLB once more which is harmless.
>> -     */
>> -    if (atomic64_xchg(&pdd->tlb_seq, tlb_seq) == tlb_seq)
>> -        return;
>> -
>> -    if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
>> -        /* Nothing to flush until a VMID is assigned, which
>> -         * only happens when the first queue is created.
>> -         */
>> -        if (pdd->qpd.vmid)
>> -            amdgpu_amdkfd_flush_gpu_tlb_vmid(dev->adev,
>> -                            pdd->qpd.vmid);
>> -    } else {
>> -        amdgpu_amdkfd_flush_gpu_tlb_pasid(dev->adev,
>> -                    pdd->process->pasid, type);
>> -    }
>> -}
>> -
>>   struct kfd_process_device *kfd_process_device_data_by_id(struct 
>> kfd_process *p, uint32_t gpu_id)
>>   {
>>       int i;
>