[PATCH] drm/amdgpu: move task_info to amdgpu_fpriv

Christian König christian.koenig at amd.com
Tue Oct 17 07:34:59 UTC 2023


Am 17.10.23 um 09:25 schrieb Shashank Sharma:
> Hello Christian, Felix,
>
> Thanks for your comments, mine inline.
>
> On 17/10/2023 07:55, Christian König wrote:
>> Am 17.10.23 um 00:15 schrieb Felix Kuehling:
>>> On 2023-10-16 13:08, Shashank Sharma wrote:
>>>> This patch does the following:
>>>> - moves vm->task_info struct to fpriv->task_info.
>>>> - makes task_info allocation dynamic.
>>>> - adds reference counting support for task_info structure.
>>>> - adds some new helper functions to find and put task_info.
>>>> - adds respective supporting changes for existing get_task_info 
>>>> consumers.
>>>>
>>>> Cc: Christian Koenig <christian.koenig at amd.com>
>>>> Cc: Alex Deucher <alexander.deucher at amd.com>
>>>> Signed-off-by: Shashank Sharma <shashank.sharma at amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h         |  2 +
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  4 +-
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  | 28 +++++--
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c     | 16 ++--
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c      | 87 
>>>> ++++++++++++++++-----
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h      |  9 +--
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c   |  5 +-
>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c      | 19 +++--
>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c      | 18 +++--
>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c       | 17 ++--
>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c       | 19 +++--
>>>>   drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c      | 19 ++---
>>>>   drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c    | 20 ++---
>>>>   drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 14 ++--
>>>>   14 files changed, 186 insertions(+), 91 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> index dc2d53081e80..a90780d38725 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>>> @@ -469,6 +469,8 @@ struct amdgpu_fpriv {
>>>>       struct mutex        bo_list_lock;
>>>>       struct idr        bo_list_handles;
>>>>       struct amdgpu_ctx_mgr    ctx_mgr;
>>>> +    struct amdgpu_task_info *task_info;
>>>> +
>>>>       /** GPU partition selection */
>>>>       uint32_t        xcp_id;
>>>>   };
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> index a4faea4fa0b5..6e9dcd13ee34 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>>>> @@ -1763,9 +1763,11 @@ static int 
>>>> amdgpu_debugfs_vm_info_show(struct seq_file *m, void *unused)
>>>>       list_for_each_entry(file, &dev->filelist, lhead) {
>>>>           struct amdgpu_fpriv *fpriv = file->driver_priv;
>>>>           struct amdgpu_vm *vm = &fpriv->vm;
>>>> +        struct amdgpu_task_info *task_info = fpriv->task_info;
>>>>             seq_printf(m, "pid:%d\tProcess:%s ----------\n",
>>>> -                vm->task_info.pid, vm->task_info.process_name);
>>>> +                task_info ? task_info->pid : 0,
>>>> +                task_info ? task_info->process_name : "");
>>>>           r = amdgpu_bo_reserve(vm->root.bo, true);
>>>>           if (r)
>>>>               break;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index 3f001a50b34a..b372a87b9b77 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -4872,6 +4872,27 @@ static void 
>>>> amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
>>>>       dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT,
>>>>                 amdgpu_devcoredump_read, amdgpu_devcoredump_free);
>>>>   }
>>>> +
>>>> +static void
>>>> +amdgpu_reset_dev_coredump(struct amdgpu_device *adev, struct 
>>>> amdgpu_reset_context *reset_context)
>>>> +{
>>>> +    struct amdgpu_task_info *ti;
>>>> +    struct amdgpu_vm *vm;
>>>> +
>>>> +    if (!reset_context->job || !reset_context->job->vm)
>>>> +        return;
>>>> +
>>>> +    vm = reset_context->job->vm;
>>>> +
>>>> +    /* Get reset task info and save a copy of data to be consumed 
>>>> later */
>>>> +    ti = amdgpu_vm_get_task_info(adev, vm->pasid);
>>>> +    if (ti) {
>>>> +        adev->reset_task_info = *ti;
>>>> +        amdgpu_reset_capture_coredumpm(adev);
>>>> +    }
>>>> +
>>>> +    amdgpu_vm_put_task_info(adev, vm->pasid);
>>>> +}
>>>>   #endif
>>>>     int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>>>> @@ -4976,12 +4997,7 @@ int amdgpu_do_asic_reset(struct list_head 
>>>> *device_list_handle,
>>>>                   vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
>>>>   #ifdef CONFIG_DEV_COREDUMP
>>>>                   tmp_adev->reset_vram_lost = vram_lost;
>>>> -                memset(&tmp_adev->reset_task_info, 0,
>>>> - sizeof(tmp_adev->reset_task_info));
>>>> -                if (reset_context->job && reset_context->job->vm)
>>>> -                    tmp_adev->reset_task_info =
>>>> - reset_context->job->vm->task_info;
>>>> -                amdgpu_reset_capture_coredumpm(tmp_adev);
>>>> +                amdgpu_reset_dev_coredump(tmp_adev, reset_context);
>>>>   #endif
>>>>                   if (vram_lost) {
>>>>                       DRM_INFO("VRAM is lost due to GPU reset!\n");
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> index 78476bc75b4e..99cf30c0bce6 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>>> @@ -35,7 +35,7 @@ static enum drm_gpu_sched_stat 
>>>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>>>>   {
>>>>       struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
>>>>       struct amdgpu_job *job = to_amdgpu_job(s_job);
>>>> -    struct amdgpu_task_info ti;
>>>> +    struct amdgpu_task_info *ti;
>>>>       struct amdgpu_device *adev = ring->adev;
>>>>       int idx;
>>>>       int r;
>>>> @@ -48,7 +48,6 @@ static enum drm_gpu_sched_stat 
>>>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>>>>           return DRM_GPU_SCHED_STAT_ENODEV;
>>>>       }
>>>>   -    memset(&ti, 0, sizeof(struct amdgpu_task_info));
>>>>       adev->job_hang = true;
>>>>         if (amdgpu_gpu_recovery &&
>>>> @@ -58,12 +57,15 @@ static enum drm_gpu_sched_stat 
>>>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>>>>           goto exit;
>>>>       }
>>>>   -    amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
>>>>       DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
>>>> -          job->base.sched->name, 
>>>> atomic_read(&ring->fence_drv.last_seq),
>>>> -          ring->fence_drv.sync_seq);
>>>> -    DRM_ERROR("Process information: process %s pid %d thread %s 
>>>> pid %d\n",
>>>> -          ti.process_name, ti.tgid, ti.task_name, ti.pid);
>>>> +           job->base.sched->name, 
>>>> atomic_read(&ring->fence_drv.last_seq),
>>>> +           ring->fence_drv.sync_seq);
>>>> +
>>>> +    ti = amdgpu_vm_get_task_info(ring->adev, job->pasid);
>>>> +    if (ti)
>>>> +        DRM_ERROR("Process information: process %s pid %d thread 
>>>> %s pid %d\n",
>>>> +               ti->process_name, ti->tgid, ti->task_name, ti->pid);
>>>> +    amdgpu_vm_put_task_info(ring->adev, job->pasid);
>>>> dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
>>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> index f5daadcec865..aabfbb8edb2b 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> @@ -2335,6 +2335,7 @@ void amdgpu_vm_fini(struct amdgpu_device 
>>>> *adev, struct amdgpu_vm *vm)
>>>>         root = amdgpu_bo_ref(vm->root.bo);
>>>>       amdgpu_bo_reserve(root, true);
>>>> +    amdgpu_vm_put_task_info(adev, vm->pasid);
>>>>       amdgpu_vm_set_pasid(adev, vm, 0);
>>>>       dma_fence_wait(vm->last_unlocked, false);
>>>>       dma_fence_put(vm->last_unlocked);
>>>> @@ -2491,26 +2492,44 @@ int amdgpu_vm_ioctl(struct drm_device *dev, 
>>>> void *data, struct drm_file *filp)
>>>>       return 0;
>>>>   }
>>>>   +static struct
>>>> +amdgpu_task_info *amdgpu_vm_find_task_info(struct amdgpu_device 
>>>> *adev, u32 pasid)
>>>> +{
>>>> +    unsigned long flags;
>>>> +    struct amdgpu_vm *vm;
>>>> +    struct amdgpu_fpriv *fpriv;
>>>> +    struct amdgpu_task_info *task_info = NULL;
>>>> +
>>>> +    xa_lock_irqsave(&adev->vm_manager.pasids, flags);
>>>> +
>>>> +    vm = xa_load(&adev->vm_manager.pasids, pasid);
>>>> +    if (vm) {
>>>> +        fpriv = container_of(vm, struct amdgpu_fpriv, vm);
>>>> +        task_info = fpriv->task_info;
>>>> +    }
>>>> +
>>>> +    xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
>>>> +    return task_info;
>>>> +}
>>>> +
>>>>   /**
>>>>    * amdgpu_vm_get_task_info - Extracts task info for a PASID.
>>>>    *
>>>>    * @adev: drm device pointer
>>>>    * @pasid: PASID identifier for VM
>>>> - * @task_info: task_info to fill.
>>>> + *
>>>> + * returns the task_info* (refrence counted) set under the vm_pasid
>>>> + * user must call amdgpu_vm_put_task_info when done with the 
>>>> task_info ptr
>>>>    */
>>>> -void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
>>>> -             struct amdgpu_task_info *task_info)
>>>> +struct amdgpu_task_info *amdgpu_vm_get_task_info(struct 
>>>> amdgpu_device *adev, u32 pasid)
>>>>   {
>>>> -    struct amdgpu_vm *vm;
>>>> -    unsigned long flags;
>>>> -
>>>> -    xa_lock_irqsave(&adev->vm_manager.pasids, flags);
>>>> +    struct amdgpu_task_info *ti;
>>>>   -    vm = xa_load(&adev->vm_manager.pasids, pasid);
>>>> -    if (vm)
>>>> -        *task_info = vm->task_info;
>>>> +    ti = amdgpu_vm_find_task_info(adev, pasid);
>>>> +    if (ti)
>>>> +        kref_get(&ti->refcount);
>>>>   -    xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
>>>> +    return ti;
>>>>   }
>>>>     /**
>>>> @@ -2520,17 +2539,49 @@ void amdgpu_vm_get_task_info(struct 
>>>> amdgpu_device *adev, u32 pasid,
>>>>    */
>>>>   void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>>>>   {
>>>> -    if (vm->task_info.pid)
>>>> -        return;
>>>> +    struct amdgpu_fpriv *fpriv = container_of(vm, struct 
>>>> amdgpu_fpriv, vm);
>>
>> Big NAK to that, no idea why everybody wants to upcast the VM 
>> structure to fpriv.
>>
>> Instead move this into the the fpriv handling in amdgpu_kms.c
> If you see the existing consumers of amdgpu_vm_get_task_info(), in 
> most of the cases they either have only the vm or the pasid available 
> (Please see gmc_v*_0_process_interrupts, sdma_v4_*_print_iv_entry or 
> kfd_smi_event_update_vmfault). Now if we want to move the task_info 
> from vm to fpriv, the only way to do it without breaking these 
> existing consumers, is to get fpriv from VM. If we move this handling 
> into amdgpu_kms.c , then how to handle these consumers who do not have 
> fpriv available with them ?

Well the primary task is to make task_info a stand alone object, if that 
means that it moves to fpriv is only optional.

When we have a lot of use cases which go to the VM with the pasid and 
want the task_info then we should probably keep that pointer there.

But that is then just a helper function something like 
amdgpu_vm_get_task_info_by_pasid() which then also returns a new 
task_info reference which the caller has to drop.

Regards,
Christian.

>>
>>>> +    struct amdgpu_task_info *task_info;
>>>>   -    vm->task_info.pid = current->pid;
>>>> -    get_task_comm(vm->task_info.task_name, current);
>>>> +    if (fpriv->task_info)
>>>> +        return;
>>>>   -    if (current->group_leader->mm != current->mm)
>>>> +    task_info = kzalloc(sizeof(*task_info), GFP_KERNEL);
>>>> +    if (!task_info) {
>>>> +        DRM_ERROR("OOM while task_info creation\n");
>>>>           return;
>>>> +    }
>>>> +
>>>> +    kref_init(&task_info->refcount);
>>>> +    task_info->pid = current->pid;
>>>> +    get_task_comm(task_info->task_name, current);
>>>> +
>>>> +    if (current->group_leader->mm != current->mm) {
>>>> +        task_info->tgid = current->group_leader->pid;
>>>> +        get_task_comm(task_info->process_name, 
>>>> current->group_leader);
>>>> +    }
>>>> +
>>>> +    kref_get(&task_info->refcount);
>>>> +    fpriv->task_info = task_info;
>>>> +}
>>>> +
>>>> +static void amdgpu_vm_free_task_info(struct kref *kref)
>>>> +{
>>>> +    kfree(container_of(kref, struct amdgpu_task_info, refcount));
>>>> +}
>>>> +
>>>> +/**
>>>> + * amdgpu_vm_put_task_info - reference down the task_info ptr.
>>>> + *
>>>> + * @adev: drm device pointer
>>>> + * @pasid: PASID identifier for VM
>>>> + */
>>>> +void amdgpu_vm_put_task_info(struct amdgpu_device *adev, u32 pasid)
>>>> +{
>>>> +    struct amdgpu_task_info *ti;
>>>>   -    vm->task_info.tgid = current->group_leader->pid;
>>>> -    get_task_comm(vm->task_info.process_name, current->group_leader);
>>>> +    ti = amdgpu_vm_find_task_info(adev, pasid);
>>> This doesn't make sense. Anyone trying to drop a reference should 
>>> already have a ti * from a previous amdgpu_vm_get_task_info call. 
>>> You don't need to find it again. Just pass the ti * from the caller 
>>> to this function.
> Agree, I will modify it.
>>
>> Yeah, agree completely. Additional to that one goal here was to 
>> decouple the VM from the task_info.
>>
>> Probably best if you put the task_info structure into amdgpu_reset.h 
>> and the related handling into amdgpu_kms.c or amdgpu_reset.c instead.
>>
> Same question as above.
>
> - Shashank
>
>> Regards,
>> Christian.
>>
>>>
>>> Regards,
>>>   Felix
>>>
>>>
>>>> +    if (ti)
>>>> +        kref_put(&ti->refcount, amdgpu_vm_free_task_info);
>>>>   }
>>>>     /**
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>> index 204ab13184ed..40a8c532a5ed 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>>>> @@ -187,6 +187,7 @@ struct amdgpu_task_info {
>>>>       char    task_name[TASK_COMM_LEN];
>>>>       pid_t    pid;
>>>>       pid_t    tgid;
>>>> +    struct kref refcount;
>>>>   };
>>>>     /**
>>>> @@ -333,9 +334,6 @@ struct amdgpu_vm {
>>>>       /* Valid while the PD is reserved or fenced */
>>>>       uint64_t        pd_phys_addr;
>>>>   -    /* Some basic info about the task */
>>>> -    struct amdgpu_task_info task_info;
>>>> -
>>>>       /* Store positions of group of BOs */
>>>>       struct ttm_lru_bulk_move lru_bulk_move;
>>>>       /* Flag to indicate if VM is used for compute */
>>>> @@ -466,8 +464,9 @@ bool amdgpu_vm_need_pipeline_sync(struct 
>>>> amdgpu_ring *ring,
>>>>                     struct amdgpu_job *job);
>>>>   void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
>>>>   -void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
>>>> -                 struct amdgpu_task_info *task_info);
>>>> +struct amdgpu_task_info *
>>>> +amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid);
>>>> +void amdgpu_vm_put_task_info(struct amdgpu_device *adev, u32 pasid);
>>>>   bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
>>>>                   u32 vmid, u32 node_id, uint64_t addr,
>>>>                   bool write_fault);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>>>> index 96d601e209b8..3d7a9ad963a8 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>>>> @@ -1023,21 +1023,24 @@ int amdgpu_vm_ptes_update(struct 
>>>> amdgpu_vm_update_params *params,
>>>>               uint64_t upd_end = min(entry_end, frag_end);
>>>>               unsigned int nptes = (upd_end - frag_start) >> shift;
>>>>               uint64_t upd_flags = flags | AMDGPU_PTE_FRAG(frag);
>>>> +            struct amdgpu_task_info *task_info;
>>>>                 /* This can happen when we set higher level PDs to
>>>>                * silent to stop fault floods.
>>>>                */
>>>>               nptes = max(nptes, 1u);
>>>>   +            task_info = amdgpu_vm_get_task_info(adev, vm->pasid);
>>>>               trace_amdgpu_vm_update_ptes(params, frag_start, upd_end,
>>>>                               min(nptes, 32u), dst, incr,
>>>>                               upd_flags,
>>>> -                            vm->task_info.tgid,
>>>> +                            task_info ? task_info->tgid : 0,
>>>> vm->immediate.fence_context);
>>>>               amdgpu_vm_pte_update_flags(params, to_amdgpu_bo_vm(pt),
>>>>                              cursor.level, pe_start, dst,
>>>>                              nptes, incr, upd_flags);
>>>>   +            amdgpu_vm_put_task_info(adev, vm->pasid);
>>>>               pe_start += nptes * 8;
>>>>               dst += nptes * incr;
>>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> index fa87a85e1017..14ded13c8b09 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> @@ -107,7 +107,7 @@ static int gmc_v10_0_process_interrupt(struct 
>>>> amdgpu_device *adev,
>>>>       struct amdgpu_vmhub *hub = &adev->vmhub[vmhub_index];
>>>>       bool retry_fault = !!(entry->src_data[1] & 0x80);
>>>>       bool write_fault = !!(entry->src_data[1] & 0x20);
>>>> -    struct amdgpu_task_info task_info;
>>>> +    struct amdgpu_task_info *task_info;
>>>>       uint32_t status = 0;
>>>>       u64 addr;
>>>>   @@ -155,15 +155,18 @@ static int 
>>>> gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
>>>>       if (!printk_ratelimit())
>>>>           return 0;
>>>>   -    memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>>>> -    amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>>>> -
>>>> -    dev_err(adev->dev,
>>>> -        "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, for 
>>>> process %s pid %d thread %s pid %d)\n",
>>>> +    dev_err(adev->dev, "[%s] page fault (src_id:%u ring:%u vmid:%u 
>>>> pasid:%u)\n",
>>>>           entry->vmid_src ? "mmhub" : "gfxhub",
>>>>           entry->src_id, entry->ring_id, entry->vmid,
>>>> -        entry->pasid, task_info.process_name, task_info.tgid,
>>>> -        task_info.task_name, task_info.pid);
>>>> +        entry->pasid);
>>>> +
>>>> +    task_info = amdgpu_vm_get_task_info(adev, entry->pasid);
>>>> +    if (task_info)
>>>> +        dev_err(adev->dev, "for process %s pid %d thread %s pid 
>>>> %d\n",
>>>> +                   task_info->process_name, task_info->tgid,
>>>> +                   task_info->task_name, task_info->pid);
>>>> +    amdgpu_vm_put_task_info(adev, entry->pasid);
>>>> +
>>>>       dev_err(adev->dev, "  in page starting at address 0x%016llx 
>>>> from client 0x%x (%s)\n",
>>>>           addr, entry->client_id,
>>>>           soc15_ih_clientid_name[entry->client_id]);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
>>>> index e3b76fd28d15..2d96567171bc 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
>>>> @@ -120,17 +120,21 @@ static int gmc_v11_0_process_interrupt(struct 
>>>> amdgpu_device *adev,
>>>>       }
>>>>         if (printk_ratelimit()) {
>>>> -        struct amdgpu_task_info task_info;
>>>> -
>>>> -        memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>>>> -        amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>>>> +        struct amdgpu_task_info *task_info;
>>>>             dev_err(adev->dev,
>>>> -            "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, 
>>>> for process %s pid %d thread %s pid %d)\n",
>>>> +            "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
>>>>               entry->vmid_src ? "mmhub" : "gfxhub",
>>>>               entry->src_id, entry->ring_id, entry->vmid,
>>>> -            entry->pasid, task_info.process_name, task_info.tgid,
>>>> -            task_info.task_name, task_info.pid);
>>>> +            entry->pasid);
>>>> +
>>>> +        task_info = amdgpu_vm_get_task_info(adev, entry->pasid);
>>>> +        if (task_info)
>>>> +            dev_err(adev->dev, " for process %s pid %d thread %s 
>>>> pid %d\n",
>>>> +                task_info->process_name, task_info->tgid,
>>>> +                task_info->task_name, task_info->pid);
>>>> +        amdgpu_vm_put_task_info(adev, entry->pasid);
>>>> +
>>>>           dev_err(adev->dev, "  in page starting at address 
>>>> 0x%016llx from client %d\n",
>>>>               addr, entry->client_id);
>>>>           if (!amdgpu_sriov_vf(adev))
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>> index 5af235202513..8759ef1c5ea5 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>> @@ -1446,14 +1446,19 @@ static int 
>>>> gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
>>>>           gmc_v8_0_set_fault_enable_default(adev, false);
>>>>         if (printk_ratelimit()) {
>>>> -        struct amdgpu_task_info task_info;
>>>> +        struct amdgpu_task_info *task_info;
>>>>   -        memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>>>> -        amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>>>> +        dev_err(adev->dev, "GPU fault detected: %d 0x%08x\n",
>>>> +            entry->src_id, entry->src_data[0]);
>>>> +
>>>> +        task_info = amdgpu_vm_get_task_info(adev, entry->pasid);
>>>> +        if (task_info) {
>>>> +            dev_err(adev->dev, "for process %s pid %d thread %s 
>>>> pid %d\n\n",
>>>> +                task_info->process_name, task_info->tgid,
>>>> +                task_info->task_name, task_info->pid);
>>>> +        }
>>>> +        amdgpu_vm_put_task_info(adev, entry->pasid);
>>>>   -        dev_err(adev->dev, "GPU fault detected: %d 0x%08x for 
>>>> process %s pid %d thread %s pid %d\n",
>>>> -            entry->src_id, entry->src_data[0], 
>>>> task_info.process_name,
>>>> -            task_info.tgid, task_info.task_name, task_info.pid);
>>>>           dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_ADDR   
>>>> 0x%08X\n",
>>>>               addr);
>>>>           dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_STATUS 
>>>> 0x%08X\n",
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> index f9a5a2c0573e..75b849d69875 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> @@ -550,7 +550,7 @@ static int gmc_v9_0_process_interrupt(struct 
>>>> amdgpu_device *adev,
>>>>       bool retry_fault = !!(entry->src_data[1] & 0x80);
>>>>       bool write_fault = !!(entry->src_data[1] & 0x20);
>>>>       uint32_t status = 0, cid = 0, rw = 0;
>>>> -    struct amdgpu_task_info task_info;
>>>> +    struct amdgpu_task_info *task_info;
>>>>       struct amdgpu_vmhub *hub;
>>>>       const char *mmhub_cid;
>>>>       const char *hub_name;
>>>> @@ -625,16 +625,19 @@ static int gmc_v9_0_process_interrupt(struct 
>>>> amdgpu_device *adev,
>>>>       if (!printk_ratelimit())
>>>>           return 0;
>>>>   -
>>>> -    memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>>>> -    amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>>>> -
>>>>       dev_err(adev->dev,
>>>> -        "[%s] %s page fault (src_id:%u ring:%u vmid:%u pasid:%u, 
>>>> for process %s pid %d thread %s pid %d)\n",
>>>> +        "[%s] %s page fault (src_id:%u ring:%u vmid:%u pasid:%u)\n",
>>>>           hub_name, retry_fault ? "retry" : "no-retry",
>>>>           entry->src_id, entry->ring_id, entry->vmid,
>>>> -        entry->pasid, task_info.process_name, task_info.tgid,
>>>> -        task_info.task_name, task_info.pid);
>>>> +        entry->pasid);
>>>> +
>>>> +    task_info = amdgpu_vm_get_task_info(adev, entry->pasid);
>>>> +    if (task_info)
>>>> +        dev_err(adev->dev, " process %s pid %d thread %s pid %d\n",
>>>> +            task_info->process_name, task_info->tgid,
>>>> +            task_info->task_name, task_info->pid);
>>>> +    amdgpu_vm_put_task_info(adev, entry->pasid);
>>>> +
>>>>       dev_err(adev->dev, "  in page starting at address 0x%016llx 
>>>> from IH client 0x%x (%s)\n",
>>>>           addr, entry->client_id,
>>>>           soc15_ih_clientid_name[entry->client_id]);
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>>> index cd37f45e01a1..d3b9fe74332a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>>>> @@ -2112,7 +2112,7 @@ static int sdma_v4_0_print_iv_entry(struct 
>>>> amdgpu_device *adev,
>>>>                             struct amdgpu_iv_entry *entry)
>>>>   {
>>>>       int instance;
>>>> -    struct amdgpu_task_info task_info;
>>>> +    struct amdgpu_task_info *task_info;
>>>>       u64 addr;
>>>>         instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
>>>> @@ -2124,15 +2124,16 @@ static int sdma_v4_0_print_iv_entry(struct 
>>>> amdgpu_device *adev,
>>>>       addr = (u64)entry->src_data[0] << 12;
>>>>       addr |= ((u64)entry->src_data[1] & 0xf) << 44;
>>>>   -    memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>>>> -    amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>>>> -
>>>>       dev_dbg_ratelimited(adev->dev,
>>>> -           "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u "
>>>> -           "pasid:%u, for process %s pid %d thread %s pid %d\n",
>>>> -           instance, addr, entry->src_id, entry->ring_id, 
>>>> entry->vmid,
>>>> -           entry->pasid, task_info.process_name, task_info.tgid,
>>>> -           task_info.task_name, task_info.pid);
>>>> +           "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u 
>>>> pasid:%u\n",
>>>> +           instance, addr, entry->src_id, entry->ring_id, 
>>>> entry->vmid, entry->pasid);
>>>> +
>>>> +    task_info = amdgpu_vm_get_task_info(adev, entry->pasid);
>>>> +    if (task_info)
>>>> +        dev_dbg_ratelimited(adev->dev, "for process %s pid %d 
>>>> thread %s pid %d\n",
>>>> +                    task_info->process_name, task_info->tgid,
>>>> +                    task_info->task_name, task_info->pid);
>>>> +    amdgpu_vm_put_task_info(adev, entry->pasid);
>>>>       return 0;
>>>>   }
>>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
>>>> index f413898dda37..56c2f744d64e 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
>>>> @@ -1633,7 +1633,7 @@ static int sdma_v4_4_2_print_iv_entry(struct 
>>>> amdgpu_device *adev,
>>>>                             struct amdgpu_iv_entry *entry)
>>>>   {
>>>>       int instance;
>>>> -    struct amdgpu_task_info task_info;
>>>> +    struct amdgpu_task_info *task_info;
>>>>       u64 addr;
>>>>         instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id);
>>>> @@ -1645,15 +1645,17 @@ static int 
>>>> sdma_v4_4_2_print_iv_entry(struct amdgpu_device *adev,
>>>>       addr = (u64)entry->src_data[0] << 12;
>>>>       addr |= ((u64)entry->src_data[1] & 0xf) << 44;
>>>>   -    memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>>>> -    amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>>>> -
>>>>       dev_dbg_ratelimited(adev->dev,
>>>> -           "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u "
>>>> -           "pasid:%u, for process %s pid %d thread %s pid %d\n",
>>>> -           instance, addr, entry->src_id, entry->ring_id, 
>>>> entry->vmid,
>>>> -           entry->pasid, task_info.process_name, task_info.tgid,
>>>> -           task_info.task_name, task_info.pid);
>>>> +           "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u 
>>>> pasid:%u\n",
>>>> +           instance, addr, entry->src_id, entry->ring_id, 
>>>> entry->vmid, entry->pasid);
>>>> +
>>>> +    task_info = amdgpu_vm_get_task_info(adev, entry->pasid);
>>>> +    if (task_info)
>>>> +        dev_dbg_ratelimited(adev->dev, "for process %s pid %d 
>>>> thread %s pid %d\n",
>>>> +                    task_info->process_name, task_info->tgid,
>>>> +                    task_info->task_name, task_info->pid);
>>>> +    amdgpu_vm_put_task_info(adev, entry->pasid);
>>>> +
>>>>       return 0;
>>>>   }
>>>>   diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c 
>>>> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>>> index d9953c2b2661..6b51262811f6 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>>>> @@ -238,16 +238,18 @@ void 
>>>> kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
>>>>     void kfd_smi_event_update_vmfault(struct kfd_node *dev, 
>>>> uint16_t pasid)
>>>>   {
>>>> -    struct amdgpu_task_info task_info;
>>>> +    struct amdgpu_task_info *task_info;
>>>>   -    memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>>>> -    amdgpu_vm_get_task_info(dev->adev, pasid, &task_info);
>>>>       /* Report VM faults from user applications, not retry from 
>>>> kernel */
>>>> -    if (!task_info.pid)
>>>> -        return;
>>>> +    task_info = amdgpu_vm_get_task_info(dev->adev, pasid);
>>>> +    if (!task_info || !task_info->pid)
>>>> +        goto unref;
>>>>         kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n",
>>>> -              task_info.pid, task_info.task_name);
>>>> +              task_info->pid, task_info->task_name);
>>>> +
>>>> +unref:
>>>> +    amdgpu_vm_put_task_info(dev->adev, pasid);
>>>>   }
>>>>     void kfd_smi_event_page_fault_start(struct kfd_node *node, 
>>>> pid_t pid,
>>



More information about the amd-gfx mailing list