[PATCH 1/1] drm/amdkfd: Add eviction debug messages

philip yang yangp at amd.com
Fri Jun 12 22:03:01 UTC 2020


It's a good idea. It would be better to add the same print in the system memory
eviction path, amdgpu_amdkfd_evict_userptr, as well.

Use WARN_ONCE to avoid duplicate messages.

Regards,

Philip


On 2020-06-11 11:34 p.m., Felix Kuehling wrote:
> Use WARN to print messages with backtrace when evictions are triggered.
> This can help determine the root cause of evictions and help spot driver
> bugs triggering evictions unintentionally, or help with performance tuning
> by avoiding conditions that cause evictions in a specific workload.
>
> The messages are controlled by a new module parameter that can be changed
> at runtime:
>
>    echo Y > /sys/module/amdgpu/parameters/debug_evictions
>    echo N > /sys/module/amdgpu/parameters/debug_evictions
>
> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h      | 2 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  | 8 ++++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c | 2 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_device.c  | 3 +++
>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h    | 5 +++++
>   5 files changed, 20 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 10ae92e835f6..6c7dd0a707c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -186,8 +186,10 @@ extern int amdgpu_noretry;
>   extern int amdgpu_force_asic_type;
>   #ifdef CONFIG_HSA_AMD
>   extern int sched_policy;
> +extern bool debug_evictions;
>   #else
>   static const int sched_policy = KFD_SCHED_POLICY_HWS;
> +static const bool debug_evictions; /* = false */
>   #endif
>   
>   extern int amdgpu_tmz;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index d4d7cca1cc72..fdf350d5e7b7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -705,6 +705,14 @@ MODULE_PARM_DESC(hws_gws_support, "Assume MEC2 FW supports GWS barriers (false =
>   int queue_preemption_timeout_ms = 9000;
>   module_param(queue_preemption_timeout_ms, int, 0644);
>   MODULE_PARM_DESC(queue_preemption_timeout_ms, "queue preemption timeout in ms (1 = Minimum, 9000 = default)");
> +
> +/**
> + * DOC: debug_evictions(bool)
> + * Enable extra debug messages to help determine the cause of evictions
> + */
> +bool debug_evictions;
> +module_param(debug_evictions, bool, 0644);
> +MODULE_PARM_DESC(debug_evictions, "enable eviction debug messages (false = default)");
>   #endif
>   
>   /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
> index b87ca171986a..072f0e1185a8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
> @@ -275,6 +275,8 @@ int amdgpu_sync_resv(struct amdgpu_device *adev, struct amdgpu_sync *sync,
>   			continue;
>   		}
>   
> +		WARN(debug_evictions && fence_owner == AMDGPU_FENCE_OWNER_KFD,
> +		     "Adding eviction fence to sync obj");
>   		r = amdgpu_sync_fence(sync, f, false);
>   		if (r)
>   			break;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> index 22348cebaf36..80393e0583bb 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
> @@ -942,6 +942,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm)
>   	if (!p)
>   		return -ESRCH;
>   
> +	WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
>   	r = kfd_process_evict_queues(p);
>   
>   	kfd_unref_process(p);
> @@ -1009,6 +1010,8 @@ int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
>   	/* During process initialization eviction_work.dwork is initialized
>   	 * to kfd_evict_bo_worker
>   	 */
> +	WARN(debug_evictions, "Scheduling eviction of pid %d in %ld jiffies",
> +	     p->lead_thread->pid, delay_jiffies);
>   	schedule_delayed_work(&p->eviction_work, delay_jiffies);
>   out:
>   	kfd_unref_process(p);
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> index 173d58b2d81f..51ba2020732e 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
> @@ -177,6 +177,11 @@ extern bool hws_gws_support;
>    */
>   extern int queue_preemption_timeout_ms;
>   
> +/*
> + * Enable eviction debug messages
> + */
> +extern bool debug_evictions;
> +
>   enum cache_policy {
>   	cache_policy_coherent,
>   	cache_policy_noncoherent


More information about the amd-gfx mailing list