[PATCH] drm/amdgpu: change vm->task_info handling
Shashank Sharma
shashank.sharma at amd.com
Wed Jan 3 09:47:06 UTC 2024
Hey Felix,
On 02/01/2024 19:02, Felix Kuehling wrote:
>
> On 2024-01-02 06:12, Shashank Sharma wrote:
>> drm/amdgpu: change vm->task_info handling
>>
>> This patch changes the handling and lifecycle of vm->task_info object.
>> The major changes are:
>> - vm->task_info is a dynamically allocated ptr now, and its usage is
>> reference counted.
>> - introducing two new helper funcs for task_info lifecycle management
>> - amdgpu_vm_get_task_info: reference counts up task_info before
>> returning this info
>> - amdgpu_vm_put_task_info: reference counts down task_info
>> - last put to task_info() frees task_info from the vm.
>>
>> This patch also does logistical changes required for existing usage
>> of vm->task_info.
>>
>> Cc: Christian Koenig <christian.koenig at amd.com>
>> Cc: Alex Deucher <alexander.deucher at amd.com>
>> Cc: Felix Kuehling <Felix.Kuehling at amd.com>
>> Signed-off-by: Shashank Sharma <shashank.sharma at amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 7 +-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 15 ++-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 17 ++-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 142 +++++++++++++++++---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 24 +++-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 2 +-
>> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 27 ++--
>> drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 28 ++--
>> drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c | 26 ++--
>> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 28 ++--
>> drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 20 +--
>> drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 19 +--
>> drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 17 +--
>> 13 files changed, 259 insertions(+), 113 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> index a4faea4fa0b5..111f8afb03a1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
>> @@ -1763,9 +1763,12 @@ static int amdgpu_debugfs_vm_info_show(struct
>> seq_file *m, void *unused)
>> list_for_each_entry(file, &dev->filelist, lhead) {
>> struct amdgpu_fpriv *fpriv = file->driver_priv;
>> struct amdgpu_vm *vm = &fpriv->vm;
>> + struct amdgpu_task_info *ti;
>> +
>> + ti = amdgpu_vm_get_task_info_vm(vm);
>> + seq_printf(m, "pid:%d\tProcess:%s ----------\n", ti->pid,
>> ti->process_name);
>> + amdgpu_vm_put_task_info_vm(ti, vm);
>> - seq_printf(m, "pid:%d\tProcess:%s ----------\n",
>> - vm->task_info.pid, vm->task_info.process_name);
>> r = amdgpu_bo_reserve(vm->root.bo, true);
>> if (r)
>> break;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 2b8356699f23..00516fa178b5 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -4952,10 +4952,17 @@ int amdgpu_do_asic_reset(struct list_head
>> *device_list_handle,
>> tmp_adev->reset_vram_lost = vram_lost;
>> memset(&tmp_adev->reset_task_info, 0,
>> sizeof(tmp_adev->reset_task_info));
>> - if (reset_context->job && reset_context->job->vm)
>> - tmp_adev->reset_task_info =
>> - reset_context->job->vm->task_info;
>> - amdgpu_reset_capture_coredumpm(tmp_adev);
>> + if (reset_context->job && reset_context->job->vm) {
>> + struct amdgpu_task_info *ti;
>> + struct amdgpu_vm *vm = reset_context->job->vm;
>> +
>> + ti = amdgpu_vm_get_task_info_vm(vm);
>> + if (ti) {
>> + tmp_adev->reset_task_info = *ti;
>> + amdgpu_reset_capture_coredumpm(tmp_adev);
>> + amdgpu_vm_put_task_info_vm(ti, vm);
>> + }
>> + }
>> #endif
>> if (vram_lost) {
>> DRM_INFO("VRAM is lost due to GPU reset!\n");
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index 78476bc75b4e..b89ee6ab7db9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -35,7 +35,7 @@ static enum drm_gpu_sched_stat
>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>> {
>> struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
>> struct amdgpu_job *job = to_amdgpu_job(s_job);
>> - struct amdgpu_task_info ti;
>> + struct amdgpu_task_info *ti;
>> struct amdgpu_device *adev = ring->adev;
>> int idx;
>> int r;
>> @@ -58,12 +58,15 @@ static enum drm_gpu_sched_stat
>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>> goto exit;
>> }
>> - amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
>> - DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
>> - job->base.sched->name,
>> atomic_read(&ring->fence_drv.last_seq),
>> - ring->fence_drv.sync_seq);
>> - DRM_ERROR("Process information: process %s pid %d thread %s pid
>> %d\n",
>> - ti.process_name, ti.tgid, ti.task_name, ti.pid);
>> + ti = amdgpu_vm_get_task_info_pasid(ring->adev, job->pasid);
>> + if (ti) {
>> + DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
>> + job->base.sched->name,
>> atomic_read(&ring->fence_drv.last_seq),
>> + ring->fence_drv.sync_seq);
>> + DRM_ERROR("Process information: process %s pid %d thread %s
>> pid %d\n",
>> + ti->process_name, ti->tgid, ti->task_name, ti->pid);
>> + amdgpu_vm_put_task_info_pasid(ring->adev, ti, job->pasid);
>> + }
>> dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> index 82f25996ff5e..0d655bffad7f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>> @@ -2332,6 +2332,9 @@ void amdgpu_vm_fini(struct amdgpu_device *adev,
>> struct amdgpu_vm *vm)
>> amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm);
>> + if (vm->task_info)
>> + amdgpu_vm_put_task_info_vm(vm->task_info, vm);
>> +
>> flush_work(&vm->pt_free_work);
>> root = amdgpu_bo_ref(vm->root.bo);
>> @@ -2492,26 +2495,129 @@ int amdgpu_vm_ioctl(struct drm_device *dev,
>> void *data, struct drm_file *filp)
>> return 0;
>> }
>> +static void amdgpu_vm_destroy_task_info(struct kref *kref)
>> +{
>> + struct amdgpu_task_info *ti = container_of(kref, struct
>> amdgpu_task_info, refcount);
>> +
>> + kfree(ti);
>> +}
>> +
>> +static inline struct amdgpu_vm *
>> +amdgpu_vm_get_vm_from_pasid(struct amdgpu_device *adev, u32 pasid)
>> +{
>> + struct amdgpu_vm *vm;
>> + unsigned long flags;
>> +
>> + xa_lock_irqsave(&adev->vm_manager.pasids, flags);
>> + vm = xa_load(&adev->vm_manager.pasids, pasid);
>> + xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
>> +
>> + return vm;
>> +}
>> +
>> +/**
>> + * amdgpu_vm_put_task_info_pasid - reference down the vm task_info ptr
>> + * frees the vm task_info ptr at the last put
>> + *
>> + * @adev: drm device pointer
>> + * @task_info: task_info struct under discussion.
>> + * @pasid: pasid of the VM which contains task_info
>> + */
>> +void amdgpu_vm_put_task_info_pasid(struct amdgpu_device *adev,
>> + struct amdgpu_task_info *task_info,
>> + u32 pasid)
>> +{
>> + int ret;
>> +
>> + ret = kref_put(&task_info->refcount, amdgpu_vm_destroy_task_info);
>> +
>> + /* Clean up if object was removed in the last put */
>> + if (ret == 1) {
>> + struct amdgpu_vm *vm;
>> +
>> + vm = amdgpu_vm_get_vm_from_pasid(adev, pasid);
>> + if (!vm) {
>> + WARN(1, "Invalid PASID %u to put task info\n", pasid);
>> + return;
>> + }
>> +
>> + vm->task_info = NULL;
>> + }
>> +}
>> +
>> +/**
>> + * amdgpu_vm_put_task_info_vm - reference down the vm task_info ptr
>> + * frees the vm task_info ptr at the last refdown
>> + *
>> + * @task_info: task_info struct under discussion.
>> + * @vm: VM which contains task_info
>> + */
>> +void amdgpu_vm_put_task_info_vm(struct amdgpu_task_info *task_info,
>> struct amdgpu_vm *vm)
>> +{
>> + int ret;
>> +
>> + ret = kref_put(&task_info->refcount, amdgpu_vm_destroy_task_info);
>> +
>> + /* Clean up if object was removed in the last put */
>> + if (ret == 1)
>> + vm->task_info = NULL;
>> +}
>> +
>> /**
>> - * amdgpu_vm_get_task_info - Extracts task info for a PASID.
>> + * amdgpu_vm_get_task_info_pasid - Extracts task info for a PASID.
>> *
>> * @adev: drm device pointer
>> * @pasid: PASID identifier for VM
>> - * @task_info: task_info to fill.
>> + *
>> + * Returns the respective task_info structure, which must be referenced
>> + * down with amdgpu_vm_put_task_info.
>> */
>> -void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
>> - struct amdgpu_task_info *task_info)
>> +struct amdgpu_task_info *
>> +amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid)
>> {
>> struct amdgpu_vm *vm;
>> - unsigned long flags;
>> + struct amdgpu_task_info *ti = NULL;
>> - xa_lock_irqsave(&adev->vm_manager.pasids, flags);
>> + vm = amdgpu_vm_get_vm_from_pasid(adev, pasid);
>> + if (vm) {
>> + ti = vm->task_info;
>> + kref_get(&vm->task_info->refcount);
>> + }
>> - vm = xa_load(&adev->vm_manager.pasids, pasid);
>> - if (vm)
>> - *task_info = vm->task_info;
>> + return ti;
>> +}
>> - xa_unlock_irqrestore(&adev->vm_manager.pasids, flags);
>> +/**
>> + * amdgpu_vm_get_task_info_vm - Extracts task info for a vm.
>> + *
>> + * @vm: VM to get info from
>> + *
>> + * Returns the respective task_info structure, which must be referenced
>> + * down with amdgpu_vm_put_task_info.
>> + */
>> +struct amdgpu_task_info *
>> +amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm)
>> +{
>> + struct amdgpu_task_info *ti = NULL;
>> +
>> + if (vm) {
>> + ti = vm->task_info;
>> + kref_get(&vm->task_info->refcount);
>> + }
>> +
>> + return ti;
>> +}
>> +
>> +static int amdgpu_vm_create_task_info(struct amdgpu_vm *vm)
>> +{
>> + vm->task_info = kzalloc(sizeof(struct amdgpu_task_info),
>> GFP_KERNEL);
>> + if (!vm->task_info) {
>> + DRM_ERROR("OOM while creating task_info space\n");
>> + return -ENOMEM;
>> + }
>> +
>> + kref_init(&vm->task_info->refcount);
>> + return 0;
>> }
>> /**
>> @@ -2521,17 +2627,23 @@ void amdgpu_vm_get_task_info(struct
>> amdgpu_device *adev, u32 pasid,
>> */
>> void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>> {
>> - if (vm->task_info.pid)
>> + if (!vm->task_info) {
>> + if (amdgpu_vm_create_task_info(vm))
>> + /* OOM */
>> + return;
>> + }
>> +
>> + if (vm->task_info->pid == current->pid)
>> return;
>> - vm->task_info.pid = current->pid;
>> - get_task_comm(vm->task_info.task_name, current);
>> + vm->task_info->pid = current->pid;
>> + get_task_comm(vm->task_info->task_name, current);
>> if (current->group_leader->mm != current->mm)
>> return;
>> - vm->task_info.tgid = current->group_leader->pid;
>> - get_task_comm(vm->task_info.process_name, current->group_leader);
>> + vm->task_info->tgid = current->group_leader->pid;
>> + get_task_comm(vm->task_info->process_name, current->group_leader);
>> }
>> /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> index 204ab13184ed..c2dce85d4f9e 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
>> @@ -183,10 +183,11 @@ struct amdgpu_vm_pte_funcs {
>> };
>> struct amdgpu_task_info {
>> - char process_name[TASK_COMM_LEN];
>> - char task_name[TASK_COMM_LEN];
>> - pid_t pid;
>> - pid_t tgid;
>> + char process_name[TASK_COMM_LEN];
>> + char task_name[TASK_COMM_LEN];
>> + pid_t pid;
>> + pid_t tgid;
>> + struct kref refcount;
>> };
>> /**
>> @@ -334,7 +335,7 @@ struct amdgpu_vm {
>> uint64_t pd_phys_addr;
>> /* Some basic info about the task */
>> - struct amdgpu_task_info task_info;
>> + struct amdgpu_task_info *task_info;
>> /* Store positions of group of BOs */
>> struct ttm_lru_bulk_move lru_bulk_move;
>> @@ -466,8 +467,17 @@ bool amdgpu_vm_need_pipeline_sync(struct
>> amdgpu_ring *ring,
>> struct amdgpu_job *job);
>> void amdgpu_vm_check_compute_bug(struct amdgpu_device *adev);
>> -void amdgpu_vm_get_task_info(struct amdgpu_device *adev, u32 pasid,
>> - struct amdgpu_task_info *task_info);
>> +struct amdgpu_task_info *
>> +amdgpu_vm_get_task_info_pasid(struct amdgpu_device *adev, u32 pasid);
>> +
>> +void amdgpu_vm_put_task_info_pasid(struct amdgpu_device *adev,
>> + struct amdgpu_task_info *task_info,
>> + u32 pasid);
>> +
>> +struct amdgpu_task_info *amdgpu_vm_get_task_info_vm(struct amdgpu_vm
>> *vm);
>> +
>> +void amdgpu_vm_put_task_info_vm(struct amdgpu_task_info *task_info,
>> struct amdgpu_vm *vm);
>> +
>> bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
>> u32 vmid, u32 node_id, uint64_t addr,
>> bool write_fault);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>> index 96d601e209b8..f8323957d8bf 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
>> @@ -1032,7 +1032,7 @@ int amdgpu_vm_ptes_update(struct
>> amdgpu_vm_update_params *params,
>> trace_amdgpu_vm_update_ptes(params, frag_start, upd_end,
>> min(nptes, 32u), dst, incr,
>> upd_flags,
>> - vm->task_info.tgid,
>> + vm->task_info ? vm->task_info->tgid : 0,
>> vm->immediate.fence_context);
>> amdgpu_vm_pte_update_flags(params, to_amdgpu_bo_vm(pt),
>> cursor.level, pe_start, dst,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index fa87a85e1017..12c624876243 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -107,7 +107,7 @@ static int gmc_v10_0_process_interrupt(struct
>> amdgpu_device *adev,
>> struct amdgpu_vmhub *hub = &adev->vmhub[vmhub_index];
>> bool retry_fault = !!(entry->src_data[1] & 0x80);
>> bool write_fault = !!(entry->src_data[1] & 0x20);
>> - struct amdgpu_task_info task_info;
>> + struct amdgpu_task_info *task_info;
>> uint32_t status = 0;
>> u64 addr;
>> @@ -155,18 +155,19 @@ static int gmc_v10_0_process_interrupt(struct
>> amdgpu_device *adev,
>> if (!printk_ratelimit())
>> return 0;
>> - memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>> - amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>> -
>> - dev_err(adev->dev,
>> - "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u, for
>> process %s pid %d thread %s pid %d)\n",
>> - entry->vmid_src ? "mmhub" : "gfxhub",
>> - entry->src_id, entry->ring_id, entry->vmid,
>> - entry->pasid, task_info.process_name, task_info.tgid,
>> - task_info.task_name, task_info.pid);
>> - dev_err(adev->dev, " in page starting at address 0x%016llx from
>> client 0x%x (%s)\n",
>> - addr, entry->client_id,
>> - soc15_ih_clientid_name[entry->client_id]);
>> + task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
>> + if (task_info) {
> This drops a lot of useful information from entry if task_info is not
> found for any reason. Please separate this so that entry->...
> information can still be printed when there is no task_info.
>
Agreed, I will fix this, and all the other occurrences like it.
- Shashank
>
>> + dev_err(adev->dev,
>> + "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u,
>> for process %s pid %d thread %s pid %d)\n",
>> + entry->vmid_src ? "mmhub" : "gfxhub",
>> + entry->src_id, entry->ring_id, entry->vmid,
>> + entry->pasid, task_info->process_name, task_info->tgid,
>> + task_info->task_name, task_info->pid);
>> + dev_err(adev->dev, " in page starting at address 0x%016llx
>> from client 0x%x (%s)\n",
>> + addr, entry->client_id,
>> + soc15_ih_clientid_name[entry->client_id]);
>> + amdgpu_vm_put_task_info_pasid(adev, task_info, entry->pasid);
>> + }
>> if (!amdgpu_sriov_vf(adev))
>> hub->vmhub_funcs->print_l2_protection_fault_status(adev,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
>> index e3b76fd28d15..ec61e371120a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
>> @@ -120,19 +120,21 @@ static int gmc_v11_0_process_interrupt(struct
>> amdgpu_device *adev,
>> }
>> if (printk_ratelimit()) {
>> - struct amdgpu_task_info task_info;
>> -
>> - memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>> - amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>> -
>> - dev_err(adev->dev,
>> - "[%s] page fault (src_id:%u ring:%u vmid:%u pasid:%u,
>> for process %s pid %d thread %s pid %d)\n",
>> - entry->vmid_src ? "mmhub" : "gfxhub",
>> - entry->src_id, entry->ring_id, entry->vmid,
>> - entry->pasid, task_info.process_name, task_info.tgid,
>> - task_info.task_name, task_info.pid);
>> - dev_err(adev->dev, " in page starting at address 0x%016llx
>> from client %d\n",
>> - addr, entry->client_id);
>> + struct amdgpu_task_info *task_info;
>> +
>> + task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
>> + if (task_info) {
>
> Same as above.
>
>
>> + dev_err(adev->dev,
>> + "[%s] page fault (src_id:%u ring:%u vmid:%u
>> pasid:%u, for process %s pid %d thread %s pid %d)\n",
>> + entry->vmid_src ? "mmhub" : "gfxhub",
>> + entry->src_id, entry->ring_id, entry->vmid,
>> + entry->pasid, task_info->process_name, task_info->tgid,
>> + task_info->task_name, task_info->pid);
>> + dev_err(adev->dev, " in page starting at address
>> 0x%016llx from client %d\n",
>> + addr, entry->client_id);
>> + amdgpu_vm_put_task_info_pasid(adev, task_info,
>> entry->pasid);
>> + }
>> +
>> if (!amdgpu_sriov_vf(adev))
>> hub->vmhub_funcs->print_l2_protection_fault_status(adev, status);
>> }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>> index 5af235202513..a33a5659713d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>> @@ -1446,18 +1446,20 @@ static int gmc_v8_0_process_interrupt(struct
>> amdgpu_device *adev,
>> gmc_v8_0_set_fault_enable_default(adev, false);
>> if (printk_ratelimit()) {
>> - struct amdgpu_task_info task_info;
>> -
>> - memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>> - amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>> -
>> - dev_err(adev->dev, "GPU fault detected: %d 0x%08x for
>> process %s pid %d thread %s pid %d\n",
>> - entry->src_id, entry->src_data[0], task_info.process_name,
>> - task_info.tgid, task_info.task_name, task_info.pid);
>> - dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_ADDR
>> 0x%08X\n",
>> - addr);
>> - dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_STATUS
>> 0x%08X\n",
>> - status);
>> + struct amdgpu_task_info *task_info;
>> +
>> + task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
>> + if (task_info) {
>
> Same as above.
>
>
>> + dev_err(adev->dev, "GPU fault detected: %d 0x%08x for
>> process %s pid %d thread %s pid %d\n",
>> + entry->src_id, entry->src_data[0],
>> task_info->process_name,
>> + task_info->tgid, task_info->task_name, task_info->pid);
>> + dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_ADDR
>> 0x%08X\n",
>> + addr);
>> + dev_err(adev->dev, " VM_CONTEXT1_PROTECTION_FAULT_STATUS
>> 0x%08X\n",
>> + status);
>> + amdgpu_vm_put_task_info_pasid(adev, task_info,
>> entry->pasid);
>> + }
>> +
>> gmc_v8_0_vm_decode_fault(adev, status, addr, mc_client,
>> entry->pasid);
>> }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index f9a5a2c0573e..fefaa57d8669 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -550,7 +550,7 @@ static int gmc_v9_0_process_interrupt(struct
>> amdgpu_device *adev,
>> bool retry_fault = !!(entry->src_data[1] & 0x80);
>> bool write_fault = !!(entry->src_data[1] & 0x20);
>> uint32_t status = 0, cid = 0, rw = 0;
>> - struct amdgpu_task_info task_info;
>> + struct amdgpu_task_info *task_info;
>> struct amdgpu_vmhub *hub;
>> const char *mmhub_cid;
>> const char *hub_name;
>> @@ -625,19 +625,19 @@ static int gmc_v9_0_process_interrupt(struct
>> amdgpu_device *adev,
>> if (!printk_ratelimit())
>> return 0;
>> -
>> - memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>> - amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>> -
>> - dev_err(adev->dev,
>> - "[%s] %s page fault (src_id:%u ring:%u vmid:%u pasid:%u, for
>> process %s pid %d thread %s pid %d)\n",
>> - hub_name, retry_fault ? "retry" : "no-retry",
>> - entry->src_id, entry->ring_id, entry->vmid,
>> - entry->pasid, task_info.process_name, task_info.tgid,
>> - task_info.task_name, task_info.pid);
>> - dev_err(adev->dev, " in page starting at address 0x%016llx from
>> IH client 0x%x (%s)\n",
>> - addr, entry->client_id,
>> - soc15_ih_clientid_name[entry->client_id]);
>> + task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
>> + if (task_info) {
>
> Same as above.
>
>
>> + dev_err(adev->dev,
>> + "[%s] %s page fault (src_id:%u ring:%u vmid:%u pasid:%u,
>> for process %s pid %d thread %s pid %d)\n",
>> + hub_name, retry_fault ? "retry" : "no-retry",
>> + entry->src_id, entry->ring_id, entry->vmid,
>> + entry->pasid, task_info->process_name, task_info->tgid,
>> + task_info->task_name, task_info->pid);
>> + dev_err(adev->dev, " in page starting at address 0x%016llx
>> from IH client 0x%x (%s)\n",
>> + addr, entry->client_id,
>> + soc15_ih_clientid_name[entry->client_id]);
>> + amdgpu_vm_put_task_info_pasid(adev, task_info, entry->pasid);
>> + }
>> if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3))
>> dev_err(adev->dev, " cookie node_id %d fault from die
>> %s%d%s\n",
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> index cd37f45e01a1..02c8d4364c87 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
>> @@ -2112,7 +2112,7 @@ static int sdma_v4_0_print_iv_entry(struct
>> amdgpu_device *adev,
>> struct amdgpu_iv_entry *entry)
>> {
>> int instance;
>> - struct amdgpu_task_info task_info;
>> + struct amdgpu_task_info *task_info;
>> u64 addr;
>> instance = sdma_v4_0_irq_id_to_seq(entry->client_id);
>> @@ -2125,14 +2125,16 @@ static int sdma_v4_0_print_iv_entry(struct
>> amdgpu_device *adev,
>> addr |= ((u64)entry->src_data[1] & 0xf) << 44;
>> memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>> - amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>> -
>> - dev_dbg_ratelimited(adev->dev,
>> - "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u "
>> - "pasid:%u, for process %s pid %d thread %s pid %d\n",
>> - instance, addr, entry->src_id, entry->ring_id, entry->vmid,
>> - entry->pasid, task_info.process_name, task_info.tgid,
>> - task_info.task_name, task_info.pid);
>> + task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
>> + if (task_info) {
>
> Same as above.
>
>
>> + dev_dbg_ratelimited(adev->dev,
>> + "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u "
>> + "pasid:%u, for process %s pid %d thread %s pid %d\n",
>> + instance, addr, entry->src_id, entry->ring_id, entry->vmid,
>> + entry->pasid, task_info->process_name, task_info->tgid,
>> + task_info->task_name, task_info->pid);
>> + amdgpu_vm_put_task_info_pasid(adev, task_info, entry->pasid);
>> + }
>> return 0;
>> }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
>> b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
>> index f413898dda37..b62a9bef72e2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
>> @@ -1633,7 +1633,7 @@ static int sdma_v4_4_2_print_iv_entry(struct
>> amdgpu_device *adev,
>> struct amdgpu_iv_entry *entry)
>> {
>> int instance;
>> - struct amdgpu_task_info task_info;
>> + struct amdgpu_task_info *task_info;
>> u64 addr;
>> instance = sdma_v4_4_2_irq_id_to_seq(entry->client_id);
>> @@ -1646,14 +1646,17 @@ static int sdma_v4_4_2_print_iv_entry(struct
>> amdgpu_device *adev,
>> addr |= ((u64)entry->src_data[1] & 0xf) << 44;
>> memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>> - amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>> + task_info = amdgpu_vm_get_task_info_pasid(adev, entry->pasid);
>> + if (task_info) {
>
> Same as above.
>
> Regards,
> Felix
>
>
>> + dev_dbg_ratelimited(adev->dev,
>> + "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u "
>> + "pasid:%u, for process %s pid %d thread %s pid %d\n",
>> + instance, addr, entry->src_id, entry->ring_id, entry->vmid,
>> + entry->pasid, task_info->process_name, task_info->tgid,
>> + task_info->task_name, task_info->pid);
>> + amdgpu_vm_put_task_info_pasid(adev, task_info, entry->pasid);
>> + }
>> - dev_dbg_ratelimited(adev->dev,
>> - "[sdma%d] address:0x%016llx src_id:%u ring:%u vmid:%u "
>> - "pasid:%u, for process %s pid %d thread %s pid %d\n",
>> - instance, addr, entry->src_id, entry->ring_id, entry->vmid,
>> - entry->pasid, task_info.process_name, task_info.tgid,
>> - task_info.task_name, task_info.pid);
>> return 0;
>> }
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> index d9953c2b2661..f6ed68fdff81 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
>> @@ -238,16 +238,17 @@ void
>> kfd_smi_event_update_thermal_throttling(struct kfd_node *dev,
>> void kfd_smi_event_update_vmfault(struct kfd_node *dev, uint16_t
>> pasid)
>> {
>> - struct amdgpu_task_info task_info;
>> + struct amdgpu_task_info *task_info;
>> memset(&task_info, 0, sizeof(struct amdgpu_task_info));
>> - amdgpu_vm_get_task_info(dev->adev, pasid, &task_info);
>> - /* Report VM faults from user applications, not retry from
>> kernel */
>> - if (!task_info.pid)
>> - return;
>> -
>> - kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n",
>> - task_info.pid, task_info.task_name);
>> + task_info = amdgpu_vm_get_task_info_pasid(dev->adev, pasid);
>> + if (task_info) {
>> + /* Report VM faults from user applications, not retry from
>> kernel */
>> + if (task_info->pid)
>> + kfd_smi_event_add(0, dev, KFD_SMI_EVENT_VMFAULT, "%x:%s\n",
>> + task_info->pid, task_info->task_name);
>> + amdgpu_vm_put_task_info_pasid(dev->adev, task_info, pasid);
>> + }
>> }
>> void kfd_smi_event_page_fault_start(struct kfd_node *node, pid_t
>> pid,
More information about the amd-gfx
mailing list