[PATCH] drm/amdgpu: Add more process info in VM for debug

Sat Dec 15 12:22:56 UTC 2018

Am 15.12.18 um 09:56 schrieb Trigger Huang:
> When debugging VMC page fault and ring hang issues, the detailed
> process information is quite helpful, especially when the issue
> can only be reproduced after a very long time running. With this
> information, only run the specific sub-testcase may also will
> reproduce the issue, which may save a lot of time for debugging.
>
> With this patch, the process information is similar as following.
> 	When VMC page fault issue happened:
> [  142.978417] amdgpu 0000:00:08.0: [gfxhub] VMC page fault (src_id:0 ring:171 vmid:2 pasid:32769),
> [  142.978542] amdgpu 0000:00:08.0: for process ocltst pid 1354 thread ocltst pid 1354, args:./ocltst -m oclperf.so -t OCLPerfDeviceEnqueueEvent,
> [  142.978693] amdgpu 0000:00:08.0:   in page starting at address 0x0000000000000000 from 27
>
> 	When ring hang issue happened:
> [ 1740.047122] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* ring comp_1.0.0 timeout, signaled seq=91571, emitted seq=91572
> [ 1740.050167] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* The job's process information is as below:
> [ 1740.053160] [drm:amdgpu_job_timedout [amdgpu]] *ERROR* Process SDMA pid 2098 thread SDMA pid 2098, cmd line:SDMA --mode goldimage_compare --offscreen --n-swapchain-images 3 --gpu 0 --frontend test_executor --n-test-threads 4
>
> Signed-off-by: Trigger Huang <Trigger.Huang at amd.com>

Well NAK, we intentionally didn't do it this way.

First of all you can't get the process info during VM creating since 
that can happen in X as well.

Second when a timeout happen the VM structure might already be released, 
so using job->vm is illegal here. What we could try is to get the VM 
using the PASID.

And last we don't want to keep the full command line around.

The only valid addition I can see here is to print the thread info when 
the timeout happens.

Regards,
Christian.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  |  3 ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.c |  8 ++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 11 +++++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  1 +
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 11 ++++++-----
>   5 files changed, 26 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 1c49b82..1a2d0c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -235,9 +235,6 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
>   		p->job->uf_addr = uf_offset;
>   	kfree(chunk_array);
>   
> -	/* Use this opportunity to fill in task info for the vm */
> -	amdgpu_vm_set_task_info(vm);
> -
>   	return 0;
>   
>   free_all_kdata:
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index e0af44f..c75ecb3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -43,6 +43,14 @@ static void amdgpu_job_timedout(struct drm_sched_job *s_job)
>   		  job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
>   		  ring->fence_drv.sync_seq);
>   
> +	if (job->vm) {
> +		struct amdgpu_task_info *ti = &(job->vm->task_info);
> +
> +		DRM_ERROR("The job's process information is as below:\n");
> +		DRM_ERROR("Process %s, thread %s, cmd line:%s\n",
> +			ti->process_name, ti->task_name, ti->cmd_line);
> +	}
> +
>   	if (amdgpu_device_should_recover_gpu(ring->adev))
>   		amdgpu_device_gpu_recover(ring->adev, job);
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index e73d152..24f3cbd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -30,6 +30,7 @@
>   #include <linux/idr.h>
>   #include <drm/drmP.h>
>   #include <drm/amdgpu_drm.h>
> +#include <linux/string_helpers.h>
>   #include "amdgpu.h"
>   #include "amdgpu_trace.h"
>   #include "amdgpu_amdkfd.h"
> @@ -3045,6 +3046,8 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
>   			goto error_free_root;
>   
>   		vm->pasid = pasid;
> +
> +		amdgpu_vm_set_task_info(vm);
>   	}
>   
>   	vm->fault_hash = init_fault_hash();
> @@ -3223,6 +3226,9 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
>   		spin_lock_irqsave(&adev->vm_manager.pasid_lock, flags);
>   		idr_remove(&adev->vm_manager.pasid_idr, vm->pasid);
>   		spin_unlock_irqrestore(&adev->vm_manager.pasid_lock, flags);
> +
> +		kfree(vm->task_info.cmd_line);
> +		vm->task_info.cmd_line = NULL;
>   	}
>   
>   	kfree(vm->fault_hash);
> @@ -3391,6 +3397,11 @@ void amdgpu_vm_set_task_info(struct amdgpu_vm *vm)
>   			vm->task_info.tgid = current->group_leader->pid;
>   			get_task_comm(vm->task_info.process_name, current->group_leader);
>   		}
> +
> +		vm->task_info.cmd_line =
> +				kstrdup_quotable_cmdline(current, GFP_KERNEL);
> +		if (!vm->task_info.cmd_line)
> +			DRM_DEBUG_DRIVER("Failed to get cmdline!\n");
>   	}
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index e8dcfd5..9fab787 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -175,6 +175,7 @@ struct amdgpu_vm_pte_funcs {
>   struct amdgpu_task_info {
>   	char	process_name[TASK_COMM_LEN];
>   	char	task_name[TASK_COMM_LEN];
> +	char    *cmd_line;
>   	pid_t	pid;
>   	pid_t	tgid;
>   };
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index bacdaef..c3e3558 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -325,11 +325,12 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>   		amdgpu_vm_get_task_info(adev, entry->pasid, &task_info);
>   
>   		dev_err(adev->dev,
> -			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u, for process %s pid %d thread %s pid %d)\n",
> -			entry->vmid_src ? "mmhub" : "gfxhub",
> -			entry->src_id, entry->ring_id, entry->vmid,
> -			entry->pasid, task_info.process_name, task_info.tgid,
> -			task_info.task_name, task_info.pid);
> +			"[%s] VMC page fault (src_id:%u ring:%u vmid:%u pasid:%u), ",
> +			entry->vmid_src ? "mmhub" : "gfxhub",  entry->src_id,
> +			entry->ring_id, entry->vmid, entry->pasid);
> +		dev_err(adev->dev, "for process %s pid %d thread %s pid %d, args:%s,",
> +			task_info.process_name, task_info.tgid,
> +			task_info.task_name, task_info.pid, task_info.cmd_line);
>   		dev_err(adev->dev, "  in page starting at address 0x%016llx from %d\n",
>   			addr, entry->client_id);
>   		if (!amdgpu_sriov_vf(adev))