[PATCH v2] drm/amdkfd: Have kfd driver use same PASID values from graphic driver

Chen, Xiaogang xiaogang.chen at amd.com
Mon Dec 2 15:05:58 UTC 2024


On 11/28/2024 2:17 PM, Felix Kuehling wrote:
>
> On 2024-11-25 10:39, Xiaogang.Chen wrote:
>> From: Xiaogang Chen <xiaogang.chen at amd.com>
>>
>> Current kfd driver has its own PASID value for a kfd process and uses 
>> it to
>> locate vm at interrupt handler or mapping between kfd process and vm. 
>> That
>> design is not working when a physical gpu device has multiple spatial
>> partitions, ex: adev in CPX mode. This patch has kfd driver use same 
>> pasid
>> values that graphic driver generated which is per vm per pasid.
>>
>> These pasid values are passed to fw/hardware. We do not need change 
>> interrupt
>> handler though more pasid values are used. Also, pasid values at log are
>> replaced by user process pid; pasid values are not exposed to user. 
>> Users see
>> their process pids that have meaning in user space.
>>
>> Signed-off-by: Xiaogang Chen <xiaogang.chen at amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |   5 +-
>>   .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  23 ++--
>>   .../gpu/drm/amd/amdkfd/cik_event_interrupt.c  |   9 +-
>>   drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  25 ++--
>>   drivers/gpu/drm/amd/amdkfd/kfd_debug.c        |  14 +--
>>   .../drm/amd/amdkfd/kfd_device_queue_manager.c |  79 ++++++------
>>   drivers/gpu/drm/amd/amdkfd/kfd_events.c       |  40 ++++---
>>   .../gpu/drm/amd/amdkfd/kfd_int_process_v11.c  |   2 +-
>>   .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |   2 +-
>>   .../drm/amd/amdkfd/kfd_packet_manager_v9.c    |   4 +-
>>   .../drm/amd/amdkfd/kfd_packet_manager_vi.c    |   3 +-
>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h         |  13 +-
>>   drivers/gpu/drm/amd/amdkfd/kfd_process.c      | 113 ++++++++++--------
>>   .../amd/amdkfd/kfd_process_queue_manager.c    |  10 +-
>>   drivers/gpu/drm/amd/amdkfd/kfd_svm.c          |  19 +--
>>   15 files changed, 193 insertions(+), 168 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> index 4b80ad860639..8a2bb1ee303e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
>> @@ -47,6 +47,7 @@ enum TLB_FLUSH_TYPE {
>>   };
>>     struct amdgpu_device;
>> +struct kfd_process_device;
>>   struct amdgpu_reset_context;
>>     enum kfd_mem_attachment_type {
>> @@ -299,8 +300,8 @@ bool amdgpu_amdkfd_compute_active(struct 
>> amdgpu_device *adev, uint32_t node_id);
>>       (&((struct amdgpu_fpriv *)                    \
>>           ((struct drm_file *)(drm_priv))->driver_priv)->vm)
>>   -int amdgpu_amdkfd_gpuvm_set_vm_pasid(struct amdgpu_device *adev,
>> -                     struct amdgpu_vm *avm, u32 pasid);
>> +unsigned int amdgpu_amdkfd_gpuvm_get_pasid(struct kfd_process_device 
>> *pdd,
>> +                        struct amdgpu_vm *avm);
>>   int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
>>                       struct amdgpu_vm *avm,
>>                       void **process_info,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> index f30548f4c3b3..0aecd3a4f8ce 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
>> @@ -1529,25 +1529,18 @@ static void 
>> amdgpu_amdkfd_gpuvm_unpin_bo(struct amdgpu_bo *bo)
>>       amdgpu_bo_unreserve(bo);
>>   }
>>   -int amdgpu_amdkfd_gpuvm_set_vm_pasid(struct amdgpu_device *adev,
>> -                     struct amdgpu_vm *avm, u32 pasid)
>> -
>> +unsigned int amdgpu_amdkfd_gpuvm_get_pasid(struct kfd_process_device 
>> *pdd,
>> +                        struct amdgpu_vm *avm)
>>   {
>> -    int ret;
>> -
>> -    /* Free the original amdgpu allocated pasid,
>> -     * will be replaced with kfd allocated pasid.
>> -     */
>> +    /* if avm has pasid assigned use it */
>>       if (avm->pasid) {
>> -        amdgpu_pasid_free(avm->pasid);
>> -        amdgpu_vm_set_pasid(adev, avm, 0);
>> +        pdd->pasid = avm->pasid;
>
> Accessing pdd here is a layering violation. This should be set in the 
> caller in KFD. There is no need to let this function have a side 
> effect on the pdd.
yes, will put pdd assignment at kfd_process_device_init_vm.
>
>
>> +        return pdd->pasid;
>>       }
>>   -    ret = amdgpu_vm_set_pasid(adev, avm, pasid);
>> -    if (ret)
>> -        return ret;
>> -
>> -    return 0;
>> +    dev_warn(pdd->dev->adev->dev, "WARN: vm %p has no pasid 
>> associated",
>> +         avm);
>> +    return -1;
>>   }
>>     int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device 
>> *adev,
>> diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
>> b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>> index 795382b55e0a..3fcf0383a06a 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
>> @@ -107,10 +107,13 @@ static void cik_event_interrupt_wq(struct 
>> kfd_node *dev,
>>           kfd_signal_hw_exception_event(pasid);
>>       else if (ihre->source_id == CIK_INTSRC_GFX_PAGE_INV_FAULT ||
>>           ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
>> +        struct kfd_process_device * pdd;
>
> There should be no space between * and pdd.
>
note
>
>>           struct kfd_vm_fault_info info;
>>             kfd_smi_event_update_vmfault(dev, pasid);
>> -        kfd_dqm_evict_pasid(dev->dqm, pasid);
>> +
>> +        pdd = kfd_lookup_process_device_by_pasid(pasid);
>
> This function is not safe to call here (see my comments in 
> kfd_process.[ch]). You should use kfd_lookup_process_by_pasid to do 
> the right SRCU locking and process reference counting.
>
> You also need a NULL check here. kfd_evict_process_device has a check 
> (but maybe doesn't need it), but kfd_signal_vm_fault_event does not. 
> If you do process reference counting correctly, you also need to 
> decrement the process ref count in the end, but only if the process is 
> not NULL.
>
yes, need to rearrange process reference counting for this case.
>
>> +        kfd_evict_process_device(pdd);
>>             memset(&info, 0, sizeof(info));
>>           amdgpu_amdkfd_gpuvm_get_vm_fault_info(dev->adev, &info);
>> @@ -118,9 +121,9 @@ static void cik_event_interrupt_wq(struct 
>> kfd_node *dev,
>>               return;
>>             if (info.vmid == vmid)
>> -            kfd_signal_vm_fault_event(dev, pasid, &info, NULL);
>> +            kfd_signal_vm_fault_event(pdd, &info, NULL);
>>           else
>> -            kfd_signal_vm_fault_event(dev, pasid, NULL, NULL);
>> +            kfd_signal_vm_fault_event(pdd, &info, NULL);
>>       }
>>   }
>>   diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> index 065d87841459..8b1fd8e7452b 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
>> @@ -155,8 +155,8 @@ static int kfd_open(struct inode *inode, struct 
>> file *filep)
>>       /* filep now owns the reference returned by kfd_create_process */
>>       filep->private_data = process;
>>   -    dev_dbg(kfd_device, "process %d opened, compat mode (32 bit) - 
>> %d\n",
>> -        process->pasid, process->is_32bit_user_mode);
>> +    dev_dbg(kfd_device, "process pid %d opened kfd node, compat mode 
>> (32 bit) - %d\n",
>> +        (int)process->lead_thread->pid, process->is_32bit_user_mode);
>>         return 0;
>>   }
>> @@ -361,8 +361,8 @@ static int kfd_ioctl_create_queue(struct file 
>> *filep, struct kfd_process *p,
>>           goto err_acquire_queue_buf;
>>       }
>>   -    pr_debug("Creating queue for PASID 0x%x on gpu 0x%x\n",
>> -            p->pasid,
>> +    pr_debug("Creating queue for process pid 0x%x on gpu 0x%x\n",
>> +            (int)p->lead_thread->pid,
>>               dev->id);
>>         err = pqm_create_queue(&p->pqm, dev, &q_properties, &queue_id,
>> @@ -415,9 +415,9 @@ static int kfd_ioctl_destroy_queue(struct file 
>> *filp, struct kfd_process *p,
>>       int retval;
>>       struct kfd_ioctl_destroy_queue_args *args = data;
>>   -    pr_debug("Destroying queue id %d for pasid 0x%x\n",
>> +    pr_debug("Destroying queue id %d for process pid 0x%x\n",
>>                   args->queue_id,
>> -                p->pasid);
>> +                (int)p->lead_thread->pid);
>>         mutex_lock(&p->mutex);
>>   @@ -468,8 +468,8 @@ static int kfd_ioctl_update_queue(struct file 
>> *filp, struct kfd_process *p,
>>       properties.pm4_target_xcc = (args->queue_percentage >> 8) & 0xFF;
>>       properties.priority = args->queue_priority;
>>   -    pr_debug("Updating queue id %d for pasid 0x%x\n",
>> -            args->queue_id, p->pasid);
>> +    pr_debug("Updating queue id %d for process pid 0x%x\n",
>> +            args->queue_id, (int)p->lead_thread->pid);
>>         mutex_lock(&p->mutex);
>>   @@ -695,7 +695,7 @@ static int 
>> kfd_ioctl_get_process_apertures(struct file *filp,
>>       struct kfd_process_device_apertures *pAperture;
>>       int i;
>>   -    dev_dbg(kfd_device, "get apertures for PASID 0x%x", p->pasid);
>> +    dev_dbg(kfd_device, "get apertures for process pid 0x%x", 
>> (int)p->lead_thread->pid);
>>         args->num_of_nodes = 0;
>>   @@ -747,7 +747,8 @@ static int 
>> kfd_ioctl_get_process_apertures_new(struct file *filp,
>>       int ret;
>>       int i;
>>   -    dev_dbg(kfd_device, "get apertures for PASID 0x%x", p->pasid);
>> +    dev_dbg(kfd_device, "get apertures for process pid 0x%x",
>> +            (int)p->lead_thread->pid);
>>         if (args->num_of_nodes == 0) {
>>           /* Return number of nodes, so that user space can alloacate
>> @@ -3365,12 +3366,12 @@ static int kfd_mmio_mmap(struct kfd_node 
>> *dev, struct kfd_process *process,
>>         vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
>>   -    pr_debug("pasid 0x%x mapping mmio page\n"
>> +    pr_debug("process pid 0x%x mapping mmio page\n"
>>            "     target user address == 0x%08llX\n"
>>            "     physical address    == 0x%08llX\n"
>>            "     vm_flags            == 0x%04lX\n"
>>            "     size                == 0x%04lX\n",
>> -         process->pasid, (unsigned long long) vma->vm_start,
>> +         (int)process->lead_thread->pid, (unsigned long long) 
>> vma->vm_start,
>>            address, vma->vm_flags, PAGE_SIZE);
>>         return io_remap_pfn_range(vma,
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
>> index 312dfa84f29f..4d147ad31910 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_debug.c
>> @@ -204,11 +204,12 @@ bool kfd_set_dbg_ev_from_interrupt(struct 
>> kfd_node *dev,
>>                      size_t exception_data_size)
>>   {
>>       struct kfd_process *p;
>> +    struct kfd_process_device * pdd;
>
> No space between * and pdd.
note
>
>
>>       bool signaled_to_debugger_or_runtime = false;
>>   -    p = kfd_lookup_process_by_pasid(pasid);
>> +    p = kfd_lookup_process_by_pasid(pasid, &pdd);
>>   -    if (!p)
>> +    if (!p || !pdd)
>>           return false;
>
> Is there a case where pdd is NULL but p is not NULL? In that case 
> you're leaking a process reference here.

pdd and p are related. If pdd is not null p cannot be null.  Here pasid 
is associated with pdd. kfd_lookup_process_by_pasid finds pdd from 
pasid. If not find pdd both returned pdd and p are null. So we can only 
check pdd here.

>
>
>>         if (!kfd_dbg_ev_raise(trap_mask, p, dev, doorbell_id, true,
>> @@ -238,9 +239,8 @@ bool kfd_set_dbg_ev_from_interrupt(struct 
>> kfd_node *dev,
>>                 mutex_unlock(&p->mutex);
>>           } else if (trap_mask & 
>> KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION)) {
>> -            kfd_dqm_evict_pasid(dev->dqm, p->pasid);
>> -            kfd_signal_vm_fault_event(dev, p->pasid, NULL,
>> -                            exception_data);
>> +            kfd_evict_process_device(pdd);
>> +            kfd_signal_vm_fault_event(pdd, NULL, exception_data);
>>                 signaled_to_debugger_or_runtime = true;
>>           }
>> @@ -276,8 +276,8 @@ int kfd_dbg_send_exception_to_runtime(struct 
>> kfd_process *p,
>>           data = (struct kfd_hsa_memory_exception_data *)
>>                           pdd->vm_fault_exc_data;
>>   -        kfd_dqm_evict_pasid(pdd->dev->dqm, p->pasid);
>> -        kfd_signal_vm_fault_event(pdd->dev, p->pasid, NULL, data);
>> +        kfd_evict_process_device(pdd);
>> +        kfd_signal_vm_fault_event(pdd, NULL, data);
>>           error_reason &= ~KFD_EC_MASK(EC_DEVICE_MEMORY_VIOLATION);
>>       }
>>   diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index 38c19dc8311d..fd4bca76b0fa 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -206,7 +206,7 @@ static int add_queue_mes(struct 
>> device_queue_manager *dqm, struct queue *q,
>>           return -EIO;
>>         memset(&queue_input, 0x0, sizeof(struct mes_add_queue_input));
>> -    queue_input.process_id = qpd->pqm->process->pasid;
>> +    queue_input.process_id = pdd->pasid;
>>       queue_input.page_table_base_addr = qpd->page_table_base;
>>       queue_input.process_va_start = 0;
>>       queue_input.process_va_end = adev->vm_manager.max_pfn - 1;
>> @@ -496,6 +496,7 @@ static int allocate_vmid(struct 
>> device_queue_manager *dqm,
>>               struct qcm_process_device *qpd,
>>               struct queue *q)
>>   {
>> +    struct kfd_process_device *pdd = qpd_to_pdd(qpd);
>>       struct device *dev = dqm->dev->adev->dev;
>>       int allocated_vmid = -1, i;
>>   @@ -514,9 +515,9 @@ static int allocate_vmid(struct 
>> device_queue_manager *dqm,
>>         pr_debug("vmid allocated: %d\n", allocated_vmid);
>>   -    dqm->vmid_pasid[allocated_vmid] = q->process->pasid;
>> +    dqm->vmid_pasid[allocated_vmid] = pdd->pasid;
>>   -    set_pasid_vmid_mapping(dqm, q->process->pasid, allocated_vmid);
>> +    set_pasid_vmid_mapping(dqm, pdd->pasid, allocated_vmid);
>>         qpd->vmid = allocated_vmid;
>>       q->properties.vmid = allocated_vmid;
>> @@ -768,6 +769,11 @@ static int dbgdev_wave_reset_wavefronts(struct 
>> kfd_node *dev, struct kfd_process
>>           return -EOPNOTSUPP;
>>       }
>>   +    /* taking the VMID for that process on the safe way using PDD */
>> +    pdd = kfd_get_process_device_data(dev, p);
>> +    if (!pdd)
>> +        return -EFAULT;
>> +
>>       /* Scan all registers in the range ATC_VMID8_PASID_MAPPING ..
>>        * ATC_VMID15_PASID_MAPPING
>>        * to check which VMID the current process is mapped to.
>> @@ -777,23 +783,19 @@ static int dbgdev_wave_reset_wavefronts(struct 
>> kfd_node *dev, struct kfd_process
>>           status = dev->kfd2kgd->get_atc_vmid_pasid_mapping_info
>>                   (dev->adev, vmid, &queried_pasid);
>>   -        if (status && queried_pasid == p->pasid) {
>> -            pr_debug("Killing wave fronts of vmid %d and pasid 0x%x\n",
>> -                    vmid, p->pasid);
>> +        if (status && queried_pasid == pdd->pasid) {
>> +            pr_debug("Killing wave fronts of vmid %d and process pid 
>> 0x%x\n",
>> +                 vmid, p->lead_thread->pid);
>>               break;
>>           }
>>       }
>>         if (vmid > last_vmid_to_scan) {
>> -        dev_err(dev->adev->dev, "Didn't find vmid for pasid 0x%x\n", 
>> p->pasid);
>> +        dev_err(dev->adev->dev, "Didn't find vmid for process pid 
>> 0x%x\n",
>> +            p->lead_thread->pid);
>>           return -EFAULT;
>>       }
>>   -    /* taking the VMID for that process on the safe way using PDD */
>> -    pdd = kfd_get_process_device_data(dev, p);
>> -    if (!pdd)
>> -        return -EFAULT;
>> -
>>       reg_gfx_index.bits.sh_broadcast_writes = 1;
>>       reg_gfx_index.bits.se_broadcast_writes = 1;
>>       reg_gfx_index.bits.instance_broadcast_writes = 1;
>> @@ -1029,8 +1031,8 @@ static int suspend_single_queue(struct 
>> device_queue_manager *dqm,
>>       if (q->properties.is_suspended)
>>           return 0;
>>   -    pr_debug("Suspending PASID %u queue [%i]\n",
>> -            pdd->process->pasid,
>> +    pr_debug("Suspending process pid %u queue [%i]\n",
>> +            pdd->process->lead_thread->pid,
>>               q->properties.queue_id);
>>         is_new = q->properties.exception_status & 
>> KFD_EC_MASK(EC_QUEUE_NEW);
>> @@ -1077,8 +1079,8 @@ static int resume_single_queue(struct 
>> device_queue_manager *dqm,
>>         pdd = qpd_to_pdd(qpd);
>>   -    pr_debug("Restoring from suspend PASID %u queue [%i]\n",
>> -                pdd->process->pasid,
>> +    pr_debug("Restoring from suspend process pid %u queue [%i]\n",
>> +                pdd->process->lead_thread->pid,
>>                   q->properties.queue_id);
>>         q->properties.is_suspended = false;
>> @@ -1111,8 +1113,8 @@ static int evict_process_queues_nocpsch(struct 
>> device_queue_manager *dqm,
>>           goto out;
>>         pdd = qpd_to_pdd(qpd);
>> -    pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
>> -                pdd->process->pasid);
>> +    pr_debug_ratelimited("Evicting process pid 0x%x queues\n",
>> +                pdd->process->lead_thread->pid);
>>         pdd->last_evict_timestamp = get_jiffies_64();
>>       /* Mark all queues as evicted. Deactivate all active queues on
>> @@ -1169,8 +1171,8 @@ static int evict_process_queues_cpsch(struct 
>> device_queue_manager *dqm,
>>       if (!pdd->drm_priv)
>>           goto out;
>>   -    pr_debug_ratelimited("Evicting PASID 0x%x queues\n",
>> -                pdd->process->pasid);
>> +    pr_debug_ratelimited("Evicting process pid 0x%x queues\n",
>> +                pdd->process->lead_thread->pid);
>>         /* Mark all queues as evicted. Deactivate all active queues on
>>        * the qpd.
>> @@ -1228,8 +1230,8 @@ static int 
>> restore_process_queues_nocpsch(struct device_queue_manager *dqm,
>>           goto out;
>>       }
>>   -    pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
>> -                pdd->process->pasid);
>> +    pr_debug_ratelimited("Restoring process pid 0x%x queues\n",
>> +                pdd->process->lead_thread->pid);
>>         /* Update PD Base in QPD */
>>       qpd->page_table_base = pd_base;
>> @@ -1312,8 +1314,8 @@ static int restore_process_queues_cpsch(struct 
>> device_queue_manager *dqm,
>>       if (!pdd->drm_priv)
>>           goto vm_not_acquired;
>>   -    pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
>> -                pdd->process->pasid);
>> +    pr_debug_ratelimited("Restoring process pid 0x%x queues\n",
>> +                 pdd->process->lead_thread->pid);
>>         /* Update PD Base in QPD */
>>       qpd->page_table_base = 
>> amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
>> @@ -2102,7 +2104,7 @@ static void set_queue_as_reset(struct 
>> device_queue_manager *dqm, struct queue *q
>>       struct kfd_process_device *pdd = qpd_to_pdd(qpd);
>>         dev_err(dqm->dev->adev->dev, "queue id 0x%0x at pasid 0x%0x 
>> is reset\n",
>> -        q->properties.queue_id, q->process->pasid);
>> +        q->properties.queue_id, pdd->process->lead_thread->pid);
>>         pdd->has_reset_queue = true;
>>       if (q->properties.is_active) {
>> @@ -2936,19 +2938,18 @@ void device_queue_manager_uninit(struct 
>> device_queue_manager *dqm)
>>   int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 
>> pasid, u32 doorbell_id)
>>   {
>>       struct kfd_process_device *pdd;
>> -    struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
>> +    struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, &pdd);
>>       struct device_queue_manager *dqm = knode->dqm;
>>       struct device *dev = dqm->dev->adev->dev;
>>       struct qcm_process_device *qpd;
>>       struct queue *q = NULL;
>>       int ret = 0;
>>   -    if (!p)
>> +    if (!p || !pdd)
>>           return -EINVAL;
>
> If there a case where pdd is NULL but p is not NULL? In that case 
> you'd leak a process reference here.
Same as above.
>
>
>>         dqm_lock(dqm);
>>   -    pdd = kfd_get_process_device_data(dqm->dev, p);
>>       if (pdd) {
>>           qpd = &pdd->qpd;
>>   @@ -3022,24 +3023,26 @@ static int kfd_dqm_evict_pasid_mes(struct 
>> device_queue_manager *dqm,
>>       return ret;
>>   }
>>   -int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid)
>> +int kfd_evict_process_device(struct kfd_process_device *pdd)
>>   {
>> -    struct kfd_process_device *pdd;
>> -    struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
>> +    struct device_queue_manager *dqm;
>> +    struct kfd_process *p;
>>       int ret = 0;
>>   -    if (!p)
>> +    if (!pdd)
>
> Better to check this in the caller.
note.
>
>
>> +        return -EINVAL;
>> +
>> +    p = pdd->process;
>> +    dqm = pdd->dev->dqm;
>> +    if (!p || !dqm)
>>           return -EINVAL;
>> +
>>       WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
>> -    pdd = kfd_get_process_device_data(dqm->dev, p);
>> -    if (pdd) {
>> -        if (dqm->dev->kfd->shared_resources.enable_mes)
>> +
>> +    if (dqm->dev->kfd->shared_resources.enable_mes)
>>               ret = kfd_dqm_evict_pasid_mes(dqm, &pdd->qpd);
>>           else
>>               ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
>> -    }
>> -
>> -    kfd_unref_process(p);
>>         return ret;
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
>> index ea3792249209..fb5b1862f746 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
>> @@ -727,7 +727,7 @@ void kfd_signal_event_interrupt(u32 pasid, 
>> uint32_t partial_id,
>>        * to process context, kfd_process could attempt to exit while 
>> we are
>>        * running so the lookup function increments the process ref 
>> count.
>>        */
>> -    struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
>> +    struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
>>         if (!p)
>>           return; /* Presumably process exited. */
>> @@ -1128,8 +1128,8 @@ static void 
>> lookup_events_by_type_and_signal(struct kfd_process *p,
>>         if (type == KFD_EVENT_TYPE_MEMORY) {
>>           dev_warn(kfd_device,
>> -            "Sending SIGSEGV to process %d (pasid 0x%x)",
>> -                p->lead_thread->pid, p->pasid);
>> +            "Sending SIGSEGV to process pid %d",
>> +                p->lead_thread->pid);
>>           send_sig(SIGSEGV, p->lead_thread, 0);
>>       }
>>   @@ -1137,13 +1137,13 @@ static void 
>> lookup_events_by_type_and_signal(struct kfd_process *p,
>>       if (send_signal) {
>>           if (send_sigterm) {
>>               dev_warn(kfd_device,
>> -                "Sending SIGTERM to process %d (pasid 0x%x)",
>> -                    p->lead_thread->pid, p->pasid);
>> +                "Sending SIGTERM to process pid %d",
>> +                    p->lead_thread->pid);
>>               send_sig(SIGTERM, p->lead_thread, 0);
>>           } else {
>>               dev_err(kfd_device,
>> -                "Process %d (pasid 0x%x) got unhandled exception",
>> -                p->lead_thread->pid, p->pasid);
>> +                "Process pid %d got unhandled exception",
>> +                p->lead_thread->pid);
>>           }
>>       }
>>   @@ -1157,7 +1157,7 @@ void kfd_signal_hw_exception_event(u32 pasid)
>>        * to process context, kfd_process could attempt to exit while 
>> we are
>>        * running so the lookup function increments the process ref 
>> count.
>>        */
>> -    struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
>> +    struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
>>         if (!p)
>>           return; /* Presumably process exited. */
>> @@ -1166,22 +1166,23 @@ void kfd_signal_hw_exception_event(u32 pasid)
>>       kfd_unref_process(p);
>>   }
>>   -void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 pasid,
>> +void kfd_signal_vm_fault_event(struct kfd_process_device * pdd,
>>                   struct kfd_vm_fault_info *info,
>>                   struct kfd_hsa_memory_exception_data *data)
>>   {
>>       struct kfd_event *ev;
>>       uint32_t id;
>> -    struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
>> +    struct kfd_process *p = pdd->process;
>>       struct kfd_hsa_memory_exception_data memory_exception_data;
>>       int user_gpu_id;
>>         if (!p)
>>           return; /* Presumably process exited. */
>
> This check is probably pointless now. Better to check the pdd pointer 
> in the caller.
note
>
>
>>   -    user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
>> +    user_gpu_id = kfd_process_get_user_gpu_id(p, pdd->dev->id);
>>       if (unlikely(user_gpu_id == -EINVAL)) {
>> -        WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n", 
>> dev->id);
>> +        WARN_ONCE(1, "Could not get user_gpu_id from dev->id:%x\n",
>> +              pdd->dev->id);
>>           return;
>>       }
>>   @@ -1218,7 +1219,6 @@ void kfd_signal_vm_fault_event(struct 
>> kfd_node *dev, u32 pasid,
>>           }
>>         rcu_read_unlock();
>> -    kfd_unref_process(p);
>>   }
>>     void kfd_signal_reset_event(struct kfd_node *dev)
>> @@ -1253,7 +1253,8 @@ void kfd_signal_reset_event(struct kfd_node *dev)
>>           }
>>             if (unlikely(!pdd)) {
>> -            WARN_ONCE(1, "Could not get device data from 
>> pasid:0x%x\n", p->pasid);
>> +            WARN_ONCE(1, "Could not get device data from process 
>> pid:0x%x\n",
>> +                  p->lead_thread->pid);
>>               continue;
>>           }
>>   @@ -1262,8 +1263,15 @@ void kfd_signal_reset_event(struct kfd_node 
>> *dev)
>>             if (dev->dqm->detect_hang_count) {
>>               struct amdgpu_task_info *ti;
>> +             struct amdgpu_fpriv *drv_priv;
>
> Incorrect indentation.
note.
>
>
>> +
>> +            if (unlikely(amdgpu_file_to_fpriv(pdd->drm_file, 
>> &drv_priv))) {
>> +                WARN_ONCE(1, "Could not get vm for device %x from 
>> pid:0x%x\n",
>> +                      dev->id, p->lead_thread->pid);
>> +                continue;
>> +            }
>>   -            ti = amdgpu_vm_get_task_info_pasid(dev->adev, p->pasid);
>> +            ti = amdgpu_vm_get_task_info_vm(&drv_priv->vm);
>>               if (ti) {
>>                   dev_err(dev->adev->dev,
>>                       "Queues reset on process %s tid %d thread %s 
>> pid %d\n",
>> @@ -1300,7 +1308,7 @@ void kfd_signal_reset_event(struct kfd_node *dev)
>>     void kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 
>> pasid)
>>   {
>> -    struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
>> +    struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
>>       struct kfd_hsa_memory_exception_data memory_exception_data;
>>       struct kfd_hsa_hw_exception_data hw_exception_data;
>>       struct kfd_event *ev;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
>> index b3f988b275a8..c5f97e6e36ff 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c
>> @@ -194,7 +194,7 @@ static void 
>> event_interrupt_poison_consumption_v11(struct kfd_node *dev,
>>       enum amdgpu_ras_block block = 0;
>>       int ret = -EINVAL;
>>       uint32_t reset = 0;
>> -    struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
>> +    struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
>>         if (!p)
>>           return;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>> index d46a13156ee9..7f4d688b8ab9 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
>> @@ -146,7 +146,7 @@ static void 
>> event_interrupt_poison_consumption_v9(struct kfd_node *dev,
>>   {
>>       enum amdgpu_ras_block block = 0;
>>       uint32_t reset = 0;
>> -    struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
>> +    struct kfd_process *p = kfd_lookup_process_by_pasid(pasid, NULL);
>>       enum ras_event_type type = RAS_EVENT_TYPE_POISON_CONSUMPTION;
>>       u64 event_id;
>>       int old_poison, ret;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
>> index 1f9f5bfeaf86..d56525201155 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c
>> @@ -47,7 +47,7 @@ static int pm_map_process_v9(struct packet_manager 
>> *pm,
>>           packet->bitfields2.exec_cleaner_shader = 1;
>>       packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
>>       packet->bitfields2.process_quantum = 10;
>> -    packet->bitfields2.pasid = qpd->pqm->process->pasid;
>> +    packet->bitfields2.pasid = pdd->pasid;
>>       packet->bitfields14.gds_size = qpd->gds_size & 0x3F;
>>       packet->bitfields14.gds_size_hi = (qpd->gds_size >> 6) & 0xF;
>>       packet->bitfields14.num_gws = (qpd->mapped_gws_queue) ? 
>> qpd->num_gws : 0;
>> @@ -106,7 +106,7 @@ static int pm_map_process_aldebaran(struct 
>> packet_manager *pm,
>>           packet->bitfields2.exec_cleaner_shader = 1;
>>       packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
>>       packet->bitfields2.process_quantum = 10;
>> -    packet->bitfields2.pasid = qpd->pqm->process->pasid;
>> +    packet->bitfields2.pasid = pdd->pasid;
>>       packet->bitfields14.gds_size = qpd->gds_size & 0x3F;
>>       packet->bitfields14.gds_size_hi = (qpd->gds_size >> 6) & 0xF;
>>       packet->bitfields14.num_gws = (qpd->mapped_gws_queue) ? 
>> qpd->num_gws : 0;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_vi.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_vi.c
>> index c1199d06d131..347c86e1c378 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_vi.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_vi.c
>> @@ -42,6 +42,7 @@ unsigned int pm_build_pm4_header(unsigned int 
>> opcode, size_t packet_size)
>>   static int pm_map_process_vi(struct packet_manager *pm, uint32_t 
>> *buffer,
>>                   struct qcm_process_device *qpd)
>>   {
>> +    struct kfd_process_device *pdd = qpd_to_pdd(qpd);
>>       struct pm4_mes_map_process *packet;
>>         packet = (struct pm4_mes_map_process *)buffer;
>> @@ -52,7 +53,7 @@ static int pm_map_process_vi(struct packet_manager 
>> *pm, uint32_t *buffer,
>>                       sizeof(struct pm4_mes_map_process));
>>       packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0;
>>       packet->bitfields2.process_quantum = 10;
>> -    packet->bitfields2.pasid = qpd->pqm->process->pasid;
>> +    packet->bitfields2.pasid = pdd->pasid;
>>       packet->bitfields3.page_table_base = qpd->page_table_base;
>>       packet->bitfields10.gds_size = qpd->gds_size;
>>       packet->bitfields10.num_gws = qpd->num_gws;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index 9e5ca0b93b2a..6aac2295834f 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -849,6 +849,8 @@ struct kfd_process_device {
>>         /* Tracks queue reset status */
>>       bool has_reset_queue;
>> +
>> +    u32 pasid;
>>   };
>>     #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, 
>> qpd)
>> @@ -908,8 +910,6 @@ struct kfd_process {
>>       /* We want to receive a notification when the mm_struct is 
>> destroyed */
>>       struct mmu_notifier mmu_notifier;
>>   -    u32 pasid;
>> -
>>       /*
>>        * Array of kfd_process_device pointers,
>>        * one for each device the process is using.
>> @@ -970,7 +970,6 @@ struct kfd_process {
>>       /* Kobj for our procfs */
>>       struct kobject *kobj;
>>       struct kobject *kobj_queues;
>> -    struct attribute attr_pasid;
>>         /* Keep track cwsr init */
>>       bool has_cwsr;
>> @@ -1034,7 +1033,9 @@ void kfd_process_destroy_wq(void);
>>   void kfd_cleanup_processes(void);
>>   struct kfd_process *kfd_create_process(struct task_struct *thread);
>>   struct kfd_process *kfd_get_process(const struct task_struct *task);
>> -struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid);
>> +struct kfd_process_device *kfd_lookup_process_device_by_pasid(u32 
>> pasid);
>
> This function is not safe to call. It should be a static function in 
> kfd_process.c only. See below.
ok, as below.
>
>
>> +struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid,
>> +                         struct kfd_process_device **pdd);
>>   struct kfd_process *kfd_lookup_process_by_mm(const struct mm_struct 
>> *mm);
>>     int kfd_process_gpuidx_from_gpuid(struct kfd_process *p, uint32_t 
>> gpu_id);
>> @@ -1331,7 +1332,7 @@ void device_queue_manager_uninit(struct 
>> device_queue_manager *dqm);
>>   struct kernel_queue *kernel_queue_init(struct kfd_node *dev,
>>                       enum kfd_queue_type type);
>>   void kernel_queue_uninit(struct kernel_queue *kq);
>> -int kfd_dqm_evict_pasid(struct device_queue_manager *dqm, u32 pasid);
>> +int kfd_evict_process_device(struct kfd_process_device *pdd);
>>   int kfd_dqm_suspend_bad_queue_mes(struct kfd_node *knode, u32 
>> pasid, u32 doorbell_id);
>>     /* Process Queue Manager */
>> @@ -1486,7 +1487,7 @@ int kfd_event_create(struct file *devkfd, 
>> struct kfd_process *p,
>>   int kfd_get_num_events(struct kfd_process *p);
>>   int kfd_event_destroy(struct kfd_process *p, uint32_t event_id);
>>   -void kfd_signal_vm_fault_event(struct kfd_node *dev, u32 pasid,
>> +void kfd_signal_vm_fault_event(struct kfd_process_device * pdd,
>>                   struct kfd_vm_fault_info *info,
>>                   struct kfd_hsa_memory_exception_data *data);
>>   diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> index 6bab6fc6a35d..93c549bd72d7 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> @@ -282,8 +282,8 @@ static int kfd_get_cu_occupancy(struct attribute 
>> *attr, char *buffer)
>>       cu_cnt = 0;
>>       proc = pdd->process;
>>       if (pdd->qpd.queue_count == 0) {
>> -        pr_debug("Gpu-Id: %d has no active queues for process %d\n",
>> -             dev->id, proc->pasid);
>> +        pr_debug("Gpu-Id: %d has no active queues for process pid 
>> %d\n",
>> +             dev->id, (int)proc->lead_thread->pid);
>>           return snprintf(buffer, PAGE_SIZE, "%d\n", cu_cnt);
>>       }
>>   @@ -327,12 +327,7 @@ static int kfd_get_cu_occupancy(struct 
>> attribute *attr, char *buffer)
>>   static ssize_t kfd_procfs_show(struct kobject *kobj, struct 
>> attribute *attr,
>>                      char *buffer)
>>   {
>> -    if (strcmp(attr->name, "pasid") == 0) {
>> -        struct kfd_process *p = container_of(attr, struct kfd_process,
>> -                             attr_pasid);
>> -
>> -        return snprintf(buffer, PAGE_SIZE, "%d\n", p->pasid);
>> -    } else if (strncmp(attr->name, "vram_", 5) == 0) {
>> +    if (strncmp(attr->name, "vram_", 5) == 0) {
>>           struct kfd_process_device *pdd = container_of(attr, struct 
>> kfd_process_device,
>>                                     attr_vram);
>>           return snprintf(buffer, PAGE_SIZE, "%llu\n", 
>> atomic64_read(&pdd->vram_usage));
>> @@ -887,9 +882,6 @@ struct kfd_process *kfd_create_process(struct 
>> task_struct *thread)
>>               goto out;
>>           }
>>   -        kfd_sysfs_create_file(process->kobj, &process->attr_pasid,
>> -                      "pasid");
>> -
>>           process->kobj_queues = kobject_create_and_add("queues",
>>                               process->kobj);
>>           if (!process->kobj_queues)
>> @@ -1055,8 +1047,8 @@ static void kfd_process_destroy_pdds(struct 
>> kfd_process *p)
>>       for (i = 0; i < p->n_pdds; i++) {
>>           struct kfd_process_device *pdd = p->pdds[i];
>>   -        pr_debug("Releasing pdd (topology id %d) for process 
>> (pasid 0x%x)\n",
>> -                pdd->dev->id, p->pasid);
>> +        pr_debug("Releasing pdd (topology id %d, for pid 0x%0x)\n",
>> +             pdd->dev->id, (int)p->lead_thread->pid);
>>             kfd_process_device_destroy_cwsr_dgpu(pdd);
>>           kfd_process_device_destroy_ib_mem(pdd);
>> @@ -1102,7 +1094,6 @@ static void kfd_process_remove_sysfs(struct 
>> kfd_process *p)
>>       if (!p->kobj)
>>           return;
>>   -    sysfs_remove_file(p->kobj, &p->attr_pasid);
>>       kobject_del(p->kobj_queues);
>>       kobject_put(p->kobj_queues);
>>       p->kobj_queues = NULL;
>> @@ -1171,7 +1162,6 @@ static void kfd_process_wq_release(struct 
>> work_struct *work)
>>         kfd_event_free_process(p);
>>   -    kfd_pasid_free(p->pasid);
>>       mutex_destroy(&p->mutex);
>>         put_task_struct(p->lead_thread);
>> @@ -1524,12 +1514,6 @@ static struct kfd_process 
>> *create_process(const struct task_struct *thread)
>>       atomic_set(&process->debugged_process_count, 0);
>>       sema_init(&process->runtime_enable_sema, 0);
>>   -    process->pasid = kfd_pasid_alloc();
>> -    if (process->pasid == 0) {
>> -        err = -ENOSPC;
>> -        goto err_alloc_pasid;
>> -    }
>> -
>>       err = pqm_init(&process->pqm, process);
>>       if (err != 0)
>>           goto err_process_pqm_init;
>> @@ -1583,8 +1567,6 @@ static struct kfd_process *create_process(const 
>> struct task_struct *thread)
>>   err_init_apertures:
>>       pqm_uninit(&process->pqm);
>>   err_process_pqm_init:
>> -    kfd_pasid_free(process->pasid);
>> -err_alloc_pasid:
>>       kfd_event_free_process(process);
>>   err_event_init:
>>       mutex_destroy(&process->mutex);
>> @@ -1723,15 +1705,14 @@ int kfd_process_device_init_vm(struct 
>> kfd_process_device *pdd,
>>       if (ret)
>>           goto err_init_cwsr;
>>   -    ret = amdgpu_amdkfd_gpuvm_set_vm_pasid(dev->adev, avm, p->pasid);
>> -    if (ret)
>> -        goto err_set_pasid;
>> +    if (amdgpu_amdkfd_gpuvm_get_pasid(pdd, avm) < 0)
>> +        goto err_get_pasid;
>>         pdd->drm_file = drm_file;
>>         return 0;
>>   -err_set_pasid:
>> +err_get_pasid:
>>       kfd_process_device_destroy_cwsr_dgpu(pdd);
>>   err_init_cwsr:
>>       kfd_process_device_destroy_ib_mem(pdd);
>> @@ -1817,25 +1798,53 @@ void 
>> kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd,
>>           idr_remove(&pdd->alloc_idr, handle);
>>   }
>>   -/* This increments the process->ref counter. */
>> -struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid)
>> +struct kfd_process_device *kfd_lookup_process_device_by_pasid(u32 
>> pasid)
>
> You should make this function static. It's not safe to call from 
> anywhere other than kfd_lookup_process_by_pasid without the SRCU 
> locking and process reference counting.

I think it is the main concern for this version. Will have 
kfd_lookup_process_device_by_pasid as static and use 
kfd_lookup_process_by_pasid in other files.

Thanks

Xiaogang

>
> Regards,
>   Felix
>
>
>>   {
>> -    struct kfd_process *p, *ret_p = NULL;
>> -    unsigned int temp;
>>   -    int idx = srcu_read_lock(&kfd_processes_srcu);
>> +    struct kfd_process_device *ret_p = NULL;
>> +    struct kfd_process *p;
>> +    unsigned int temp;
>> +    int i;
>>         hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
>> -        if (p->pasid == pasid) {
>> -            kref_get(&p->ref);
>> -            ret_p = p;
>> -            break;
>> +        for (i = 0; i < p->n_pdds; i++) {
>> +            if (p->pdds[i]->pasid == pasid) {
>> +                ret_p = p->pdds[i];
>> +                break;
>> +            }
>>           }
>> +        if (ret_p)
>> +            break;
>> +    }
>> +
>> +    return ret_p;
>> +
>> +}
>> +
>> +/* This increments the process->ref counter. */
>> +struct kfd_process *kfd_lookup_process_by_pasid(u32 pasid,
>> +                         struct kfd_process_device **pdd)
>> +{
>> +    struct kfd_process_device *ret_p;
>> +
>> +    int idx = srcu_read_lock(&kfd_processes_srcu);
>> +
>> +    ret_p = kfd_lookup_process_device_by_pasid(pasid);
>> +    if (ret_p) {
>> +        if (pdd)
>> +            *pdd = ret_p;
>> +        kref_get(&ret_p->process->ref);
>> +
>> +        srcu_read_unlock(&kfd_processes_srcu, idx);
>> +        return ret_p->process;
>>       }
>>         srcu_read_unlock(&kfd_processes_srcu, idx);
>>   -    return ret_p;
>> +    if (pdd)
>> +        *pdd = NULL;
>> +
>> +    return NULL;
>>   }
>>     /* This increments the process->ref counter. */
>> @@ -1991,7 +2000,7 @@ static void evict_process_worker(struct 
>> work_struct *work)
>>        */
>>       p = container_of(dwork, struct kfd_process, eviction_work);
>>   -    pr_debug("Started evicting pasid 0x%x\n", p->pasid);
>> +    pr_debug("Started evicting process pid 0x%x\n", 
>> (int)p->lead_thread->pid);
>>       ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_TTM);
>>       if (!ret) {
>>           /* If another thread already signaled the eviction fence,
>> @@ -2003,9 +2012,9 @@ static void evict_process_worker(struct 
>> work_struct *work)
>> msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)))
>>               kfd_process_restore_queues(p);
>>   -        pr_debug("Finished evicting pasid 0x%x\n", p->pasid);
>> +        pr_debug("Finished evicting process pid 0x%x\n", 
>> (int)p->lead_thread->pid);
>>       } else
>> -        pr_err("Failed to evict queues of pasid 0x%x\n", p->pasid);
>> +        pr_err("Failed to evict queues of process pid 0x%x\n", 
>> (int)p->lead_thread->pid);
>>   }
>>     static int restore_process_helper(struct kfd_process *p)
>> @@ -2022,9 +2031,11 @@ static int restore_process_helper(struct 
>> kfd_process *p)
>>         ret = kfd_process_restore_queues(p);
>>       if (!ret)
>> -        pr_debug("Finished restoring pasid 0x%x\n", p->pasid);
>> +        pr_debug("Finished restoring process pid 0x%x\n",
>> +             (int)p->lead_thread->pid);
>>       else
>> -        pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid);
>> +        pr_err("Failed to restore queues of process pid 0x%x\n",
>> +            (int)p->lead_thread->pid);
>>         return ret;
>>   }
>> @@ -2041,7 +2052,7 @@ static void restore_process_worker(struct 
>> work_struct *work)
>>        * lifetime of this thread, kfd_process p will be valid
>>        */
>>       p = container_of(dwork, struct kfd_process, restore_work);
>> -    pr_debug("Started restoring pasid 0x%x\n", p->pasid);
>> +    pr_debug("Started restoring process pasid 0x%x\n", 
>> (int)p->lead_thread->pid);
>>         /* Setting last_restore_timestamp before successful restoration.
>>        * Otherwise this would have to be set by KGD 
>> (restore_process_bos)
>> @@ -2057,8 +2068,8 @@ static void restore_process_worker(struct 
>> work_struct *work)
>>         ret = restore_process_helper(p);
>>       if (ret) {
>> -        pr_debug("Failed to restore BOs of pasid 0x%x, retry after 
>> %d ms\n",
>> -             p->pasid, PROCESS_BACK_OFF_TIME_MS);
>> +        pr_debug("Failed to restore BOs of process pid 0x%x, retry 
>> after %d ms\n",
>> +             (int)p->lead_thread->pid, PROCESS_BACK_OFF_TIME_MS);
>>           if (mod_delayed_work(kfd_restore_wq, &p->restore_work,
>> msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)))
>>               kfd_process_restore_queues(p);
>> @@ -2074,7 +2085,7 @@ void kfd_suspend_all_processes(void)
>>       WARN(debug_evictions, "Evicting all processes");
>>       hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
>>           if (kfd_process_evict_queues(p, 
>> KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
>> -            pr_err("Failed to suspend process 0x%x\n", p->pasid);
>> +            pr_err("Failed to suspend process pid 0x%x\n", 
>> (int)p->lead_thread->pid);
>>           signal_eviction_fence(p);
>>       }
>>       srcu_read_unlock(&kfd_processes_srcu, idx);
>> @@ -2088,8 +2099,8 @@ int kfd_resume_all_processes(void)
>>         hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
>>           if (restore_process_helper(p)) {
>> -            pr_err("Restore process %d failed during resume\n",
>> -                   p->pasid);
>> +            pr_err("Restore process pid %d failed during resume\n",
>> +                   (int)p->lead_thread->pid);
>>               ret = -EFAULT;
>>           }
>>       }
>> @@ -2144,7 +2155,7 @@ int kfd_process_drain_interrupts(struct 
>> kfd_process_device *pdd)
>>       memset(irq_drain_fence, 0, sizeof(irq_drain_fence));
>>       irq_drain_fence[0] = (KFD_IRQ_FENCE_SOURCEID << 8) |
>>                               KFD_IRQ_FENCE_CLIENTID;
>> -    irq_drain_fence[3] = pdd->process->pasid;
>> +    irq_drain_fence[3] = pdd->pasid;
>>         /*
>>        * For GFX 9.4.3, send the NodeId also in IH cookie DW[3]
>> @@ -2174,7 +2185,7 @@ void kfd_process_close_interrupt_drain(unsigned 
>> int pasid)
>>   {
>>       struct kfd_process *p;
>>   -    p = kfd_lookup_process_by_pasid(pasid);
>> +    p = kfd_lookup_process_by_pasid(pasid, NULL);
>>         if (!p)
>>           return;
>> @@ -2296,7 +2307,7 @@ int kfd_debugfs_mqds_by_process(struct seq_file 
>> *m, void *data)
>>         hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
>>           seq_printf(m, "Process %d PASID 0x%x:\n",
>> -               p->lead_thread->tgid, p->pasid);
>> +               p->lead_thread->tgid, (int)p->lead_thread->pid);
>>             mutex_lock(&p->mutex);
>>           r = pqm_debugfs_mqds(m, &p->pqm);
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> index c76db22a1000..1f98c16e0fe9 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> @@ -69,8 +69,8 @@ static int find_available_queue_slot(struct 
>> process_queue_manager *pqm,
>>       pr_debug("The new slot id %lu\n", found);
>>         if (found >= KFD_MAX_NUM_OF_QUEUES_PER_PROCESS) {
>> -        pr_info("Cannot open more queues for process with pasid 
>> 0x%x\n",
>> -                pqm->process->pasid);
>> +        pr_info("Cannot open more queues for process with pid 0x%x\n",
>> +            pqm->process->lead_thread->pid);
>>           return -ENOMEM;
>>       }
>>   @@ -425,8 +425,8 @@ int pqm_create_queue(struct 
>> process_queue_manager *pqm,
>>       }
>>         if (retval != 0) {
>> -        pr_err("Pasid 0x%x DQM create queue type %d failed. ret %d\n",
>> -            pqm->process->pasid, type, retval);
>> +        pr_err("process pid 0x%x DQM create queue type %d failed. 
>> ret %d\n",
>> +            pqm->process->lead_thread->pid, type, retval);
>>           goto err_create_queue;
>>       }
>>   @@ -520,7 +520,7 @@ int pqm_destroy_queue(struct 
>> process_queue_manager *pqm, unsigned int qid)
>>           retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q);
>>           if (retval) {
>>               pr_err("Pasid 0x%x destroy queue %d failed, ret %d\n",
>> -                pqm->process->pasid,
>> +                pdd->pasid,
>>                   pqn->q->properties.queue_id, retval);
>>               if (retval != -ETIME)
>>                   goto err_destroy_queue;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
>> b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> index 3e2911895c74..a443c5d8ae1d 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
>> @@ -563,7 +563,8 @@ svm_range_vram_node_new(struct kfd_node *node, 
>> struct svm_range *prange,
>>       int r;
>>         p = container_of(prange->svms, struct kfd_process, svms);
>> -    pr_debug("pasid: %x svms 0x%p [0x%lx 0x%lx]\n", p->pasid, 
>> prange->svms,
>> +    pr_debug("process pid: %x svms 0x%p [0x%lx 0x%lx]\n",
>> +         (int)p->lead_thread->pid, prange->svms,
>>            prange->start, prange->last);
>>         if (svm_range_validate_svm_bo(node, prange))
>> @@ -2969,7 +2970,7 @@ svm_range_restore_pages(struct amdgpu_device 
>> *adev, unsigned int pasid,
>>           return -EFAULT;
>>       }
>>   -    p = kfd_lookup_process_by_pasid(pasid);
>> +    p = kfd_lookup_process_by_pasid(pasid, NULL);
>>       if (!p) {
>>           pr_debug("kfd process not founded pasid 0x%x\n", pasid);
>>           return 0;
>> @@ -3232,7 +3233,8 @@ void svm_range_list_fini(struct kfd_process *p)
>>       struct svm_range *prange;
>>       struct svm_range *next;
>>   -    pr_debug("pasid 0x%x svms 0x%p\n", p->pasid, &p->svms);
>> +    pr_debug("process pid 0x%x svms 0x%p\n", (int)p->lead_thread->pid,
>> +         &p->svms);
>>         cancel_delayed_work_sync(&p->svms.restore_work);
>>   @@ -3255,7 +3257,8 @@ void svm_range_list_fini(struct kfd_process *p)
>>         mutex_destroy(&p->svms.lock);
>>   -    pr_debug("pasid 0x%x svms 0x%p done\n", p->pasid, &p->svms);
>> +    pr_debug("process pid 0x%x svms 0x%p done\n",
>> +        (int)p->lead_thread->pid, &p->svms);
>>   }
>>     int svm_range_list_init(struct kfd_process *p)
>> @@ -3618,8 +3621,8 @@ svm_range_set_attr(struct kfd_process *p, 
>> struct mm_struct *mm,
>>       bool flush_tlb;
>>       int r, ret = 0;
>>   -    pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n",
>> -         p->pasid, &p->svms, start, start + size - 1, size);
>> +    pr_debug("process pid 0x%x svms 0x%p [0x%llx 0x%llx] pages 
>> 0x%llx\n",
>> +         (int)p->lead_thread->pid, &p->svms, start, start + size - 
>> 1, size);
>>         r = svm_range_check_attr(p, nattr, attrs);
>>       if (r)
>> @@ -3727,8 +3730,8 @@ svm_range_set_attr(struct kfd_process *p, 
>> struct mm_struct *mm,
>>   out:
>>       mutex_unlock(&process_info->lock);
>>   -    pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", 
>> p->pasid,
>> -         &p->svms, start, start + size - 1, r);
>> +    pr_debug("process pid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n",
>> +         (int)p->lead_thread->pid, &p->svms, start, start + size - 
>> 1, r);
>>         return ret ? ret : r;
>>   }


More information about the amd-gfx mailing list