[PATCH 1/1] drm/amdkfd: Add eviction debug messages

philip yang yangp at amd.com
Sat Jun 13 02:09:31 UTC 2020


Thanks for the explanation.

Reviewed-by: Philip Yang <Philip.Yang at amd.com>

On 2020-06-12 7:43 p.m., Felix Kuehling wrote:

> Am 2020-06-12 um 6:03 p.m. schrieb philip yang:
>> It's a good idea. It would be better to add the same print in the system
>> memory eviction path, amdgpu_amdkfd_evict_userptr.
> That's covered by the message in kgd2kfd_quiesce_mm.
>
>
>> Use WARN_ONCE to avoid duplicate messages.
> I want duplicate messages. If many different kinds of evictions are
> happening I want to see them all. The module parameter is there so I can
> turn it on/off for short bursts while interesting things are happening.
> It's off by default.
>
> I was considering WARN_RATELIMIT, but that may skip interesting
> evictions I actually want to see.
>
> Regards,
>    Felix
>
>
>> Regards,
>>
>> Philip
>>
>>
>> On 2020-06-11 11:34 p.m., Felix Kuehling wrote:
>>> Use WARN to print messages with backtrace when evictions are triggered.
>>> This can help determine the root cause of evictions and help spot driver
>>> bugs triggering evictions unintentionally, or help with performance
>>> tuning by avoiding conditions that cause evictions in a specific workload.
>>>
>>> The messages are controlled by a new module parameter that can be
>>> changed at runtime:
>>>
>>>     echo Y > /sys/module/amdgpu/parameters/debug_evictions
>>>     echo N > /sys/module/amdgpu/parameters/debug_evictions
>>>
>>> Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h      | 2 ++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  | 8 ++++++++
>>>    drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c | 2 ++
>>>    drivers/gpu/drm/amd/amdkfd/kfd_device.c  | 3 +++
>>>    drivers/gpu/drm/amd/amdkfd/kfd_priv.h    | 5 +++++
>>>    5 files changed, 20 insertions(+)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> index 10ae92e835f6..6c7dd0a707c9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>>> @@ -186,8 +186,10 @@ extern int amdgpu_noretry;
>>>    extern int amdgpu_force_asic_type;
>>>    #ifdef CONFIG_HSA_AMD
>>>    extern int sched_policy;
>>> +extern bool debug_evictions;
>>>    #else
>>>    static const int sched_policy = KFD_SCHED_POLICY_HWS;
>>> +static const bool debug_evictions; /* = false */
>>>    #endif
>>>      extern int amdgpu_tmz;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index d4d7cca1cc72..fdf350d5e7b7 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -705,6 +705,14 @@ MODULE_PARM_DESC(hws_gws_support, "Assume MEC2
>>> FW supports GWS barriers (false =
>>>    int queue_preemption_timeout_ms = 9000;
>>>    module_param(queue_preemption_timeout_ms, int, 0644);
>>>    MODULE_PARM_DESC(queue_preemption_timeout_ms, "queue preemption
>>> timeout in ms (1 = Minimum, 9000 = default)");
>>> +
>>> +/**
>>> + * DOC: debug_evictions(bool)
>>> + * Enable extra debug messages to help determine the cause of evictions
>>> + */
>>> +bool debug_evictions;
>>> +module_param(debug_evictions, bool, 0644);
>>> +MODULE_PARM_DESC(debug_evictions, "enable eviction debug messages
>>> (false = default)");
>>>    #endif
>>>      /**
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
>>> index b87ca171986a..072f0e1185a8 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sync.c
>>> @@ -275,6 +275,8 @@ int amdgpu_sync_resv(struct amdgpu_device *adev,
>>> struct amdgpu_sync *sync,
>>>                continue;
>>>            }
>>>    +        WARN(debug_evictions && fence_owner ==
>>> AMDGPU_FENCE_OWNER_KFD,
>>> +             "Adding eviction fence to sync obj");
>>>            r = amdgpu_sync_fence(sync, f, false);
>>>            if (r)
>>>                break;
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> index 22348cebaf36..80393e0583bb 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
>>> @@ -942,6 +942,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm)
>>>        if (!p)
>>>            return -ESRCH;
>>>    +    WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid);
>>>        r = kfd_process_evict_queues(p);
>>>          kfd_unref_process(p);
>>> @@ -1009,6 +1010,8 @@ int
>>> kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
>>>        /* During process initialization eviction_work.dwork is
>>> initialized
>>>         * to kfd_evict_bo_worker
>>>         */
>>> +    WARN(debug_evictions, "Scheduling eviction of pid %d in %ld
>>> jiffies",
>>> +         p->lead_thread->pid, delay_jiffies);
>>>        schedule_delayed_work(&p->eviction_work, delay_jiffies);
>>>    out:
>>>        kfd_unref_process(p);
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> index 173d58b2d81f..51ba2020732e 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>>> @@ -177,6 +177,11 @@ extern bool hws_gws_support;
>>>     */
>>>    extern int queue_preemption_timeout_ms;
>>>    +/*
>>> + * Enable eviction debug messages
>>> + */
>>> +extern bool debug_evictions;
>>> +
>>>    enum cache_policy {
>>>        cache_policy_coherent,
>>>        cache_policy_noncoherent


More information about the amd-gfx mailing list